### Description

Data set
- terminal_id идентификационный номер терминала
- customer_id идентификационный номер клиента
- amount количество потраченных средств за транзакцию
- country страна
- currency валюта
- mcc код категории продавца
- transaction_date дата транзакции
- atm_address адрес банкомата
- pos_address адрес установки pos-терминала
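- pos_adress_lat широта адреса установки pos-терминала (в данных столбец называется именно `pos_adress_lat`)
- pos_adress_lon долгота адреса установки pos-терминала (в данных столбец называется именно `pos_adress_lon`)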
- work_add_lat широта работы клиента
- work_add_lon долгота работы клиента
- home_add_lat широта дома клиента
- home_add_lon долгота дома клиента

Вам предстоит предсказать две пары координат — дома и работы клиента: `(_HOME_LAT_,_HOME_LON_,_WORK_LAT_,_WORK_LON_)`. В качестве датасета участникам предоставлены истории транзакций клиентов «Райффайзенбанка».

In [1]:
import pandas as pd
import numpy as np

import requests
import time, logging, os


from tqdm._tqdm_notebook import tqdm_notebook

import urllib.request as ur  

from multiprocessing import Pool

import json
from pandas.io.json import json_normalize

### Load part

In [2]:
# Load the raw inputs from the data/ directory: the train and test transaction
# sets, plus 'all.csv', which cnt_chng later uses as a country-code lookup
# through its 'alpha-2' and 'alpha-3' columns.
train = pd.read_csv('data/train_set.csv', encoding = 'utf-8')
test = pd.read_csv('data/test_set.csv', encoding = 'utf-8')
cnt = pd.read_csv('data/all.csv', encoding = 'utf-8')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_adress_lat,pos_adress_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.858198,30.229024,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177


### Feature extracting part

#### Date split

In [4]:
def holidays(d, m, year):
    """Return 1 if the given date is a public holiday in Russia, else 0.

    Sends one request per call to the Enrico holiday service at
    kayaposoft.com (``action=isPublicHoliday``, ``country=ru``).

    Args:
        d: Day of month; rendered with ``str()`` into the query string.
        m: Month; rendered with ``str()`` into the query string.
        year: Year; rendered with ``str()`` into the query string.

    Returns:
        int: 1 when the response field ``isPublicHoliday`` is true, 0 otherwise.

    Raises:
        urllib.error.URLError: If the service cannot be reached.
        KeyError: If the response carries no ``isPublicHoliday`` field.
    """
    # Pause before every request (presumably to stay under the service's
    # rate limit -- confirm before changing the delay).
    time.sleep(0.2)
    url = ("https://kayaposoft.com/enrico/json/v2.0/?action=isPublicHoliday&date="
           + str(d) + '-' + str(m) + '-' + str(year) + "&country=ru")
    # The context manager closes the HTTP response deterministically; the
    # previous version left it open until garbage collection.
    with ur.urlopen(url) as response:
        data = json.loads(response.read())
    # JSON booleans decode to Python bool, so an identity check is sufficient.
    return 1 if data['isPublicHoliday'] is True else 0

def extr(df):
    """Replace ``transaction_date`` with calendar features and a holiday flag.

    The frame is modified in place and also returned.  Added columns:
    ``holiday`` (result of ``holidays()`` for the date), ``tm_year``,
    ``tm_mon``, ``tm_mday``, ``tm_wday`` (Monday == 0) and ``tm_yday``
    (1-based day of year).  ``transaction_date`` itself is dropped.

    Args:
        df: DataFrame with a ``transaction_date`` column of ``YYYY-MM-DD``
            strings; missing values are treated as ``'2017-02-01'``.

    Returns:
        The same DataFrame with the columns described above.
    """
    # DataFrame.set_value was removed in pandas 1.0; fillna performs the same
    # "missing date -> fixed placeholder" substitution.
    df['transaction_date'] = df['transaction_date'].fillna('2017-02-01')

    # One web request per distinct date.  The results go into a dict so the
    # per-row lookup is a hash access instead of filtering a helper frame
    # once for every row.
    holiday_by_date = {}
    for date_str in tqdm_notebook(df['transaction_date'].unique()):
        parts = date_str.split('-')
        holiday_by_date[date_str] = holidays(parts[2], parts[1], parts[0])
    df['holiday'] = df['transaction_date'].map(holiday_by_date)

    # Parse every date once and read the fields off the .dt accessor instead
    # of calling time.strptime five times per row.
    parsed = pd.to_datetime(df['transaction_date'], format='%Y-%m-%d')
    df['tm_year'] = parsed.dt.year
    df['tm_mon'] = parsed.dt.month
    df['tm_mday'] = parsed.dt.day
    df['tm_wday'] = parsed.dt.dayofweek  # Monday == 0, same as struct_time.tm_wday
    df['tm_yday'] = parsed.dt.dayofyear  # 1-based, same as struct_time.tm_yday
    del df['transaction_date']
    return df

# Swap transaction_date for the derived calendar/holiday columns in both frames.
train = extr(train)
test = extr(test)

#### Country fix

In [5]:
def cnt_chng(x):
    """Map a country code to its ``alpha-2`` form using the ``cnt`` table.

    Args:
        x: Candidate country code.

    Returns:
        ``x`` unchanged when it already appears in ``cnt['alpha-2']``; the
        ``alpha-2`` value of the first row whose ``alpha-3`` equals ``x``
        otherwise; ``np.nan`` when neither column contains ``x``.
    """
    # Series.as_matrix() was removed in pandas 1.0; .values yields the same
    # ndarray and exists in old and new pandas alike.
    if x in cnt['alpha-2'].values:
        return x
    # A single filtered lookup replaces the former "scan alpha-3 for
    # membership, then filter again" pair.
    alpha2_for_alpha3 = cnt.loc[cnt['alpha-3'] == x, 'alpha-2']
    if not alpha2_for_alpha3.empty:
        return alpha2_for_alpha3.values[0]
    return np.nan

def _normalise_country(raw):
    """Normalise the first whitespace-separated token of a raw country value.

    Missing (NaN / non-string) or blank values yield ``np.nan`` instead of
    raising on ``.split()`` or on indexing an empty token list.
    """
    tokens = raw.split() if isinstance(raw, str) else []
    return cnt_chng(tokens[0]) if tokens else np.nan


train['country'] = train['country'].apply(_normalise_country)
print('train don')
test['country'] = test['country'].apply(_normalise_country)

In [6]:
# Rebind train/test from CSV files on disk; the first CSV column becomes the
# index.  This discards the in-memory frames built by the cells above.
# NOTE(review): presumably these files are saved outputs of the date/country
# steps -- confirm where they are written.
train = pd.read_csv('data/train_date.csv', encoding = 'utf-8', index_col=0)
test = pd.read_csv('data/test_date.csv', encoding = 'utf-8', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)
  interactivity=interactivity, compiler=compiler, result=result)


#### Geo fix

In [7]:
# Configure a dedicated file logger for the reverse-geocoding step.
# exist_ok avoids the race between a separate exists() check and makedirs().
os.makedirs("logs/", exist_ok=True)
log = logging.getLogger('reverse_geocoder')
log.setLevel(logging.INFO)
# getLogger returns the same logger object on every run of this cell, so a
# re-run used to stack one more FileHandler and each record was then written
# once per handler.  Detach and close earlier file handlers before adding a
# fresh one.
for _stale in [h for h in log.handlers if isinstance(h, logging.FileHandler)]:
    log.removeHandler(_stale)
    _stale.close()
# mode='w' truncates the log file each time the handler is created.
lfh = logging.FileHandler('logs/reverse_geocoder.log', mode='w', encoding='utf-8')
lfh.setFormatter(logging.Formatter('%(levelname)s %(asctime)s %(message)s'))
log.addHandler(lfh)
log.info('process started')

In [12]:
def parse_city(geocode_data):
    """Return the ``long_name`` of the first region/town component.

    Components of ``geocode_data['address_components']`` are scanned in
    order; the first one typed ``administrative_area_level_1`` or
    ``postal_town`` wins.  ``np.nan`` is returned when the input is ``None``,
    has no ``address_components`` entry, or no component matches.
    """
    if geocode_data is None or 'address_components' not in geocode_data:
        return np.nan
    wanted = ('administrative_area_level_1', 'postal_town')
    for part in geocode_data['address_components']:
        kinds = part['types']
        if any(tag in kinds for tag in wanted):
            return part['long_name']
    return np.nan

def parse_country(geocode_data):
    """Return the ``short_name`` of the first component typed ``country``.

    Yields ``np.nan`` when ``geocode_data`` is ``None``, lacks an
    ``address_components`` entry, or contains no such component.
    """
    if geocode_data is None or 'address_components' not in geocode_data:
        return np.nan
    country_names = (
        part['short_name']
        for part in geocode_data['address_components']
        if 'country' in part['types']
    )
    return next(country_names, np.nan)

def parse_street(geocode_data):
    """Return the ``short_name`` of the first component typed ``route``.

    Falls back to ``np.nan`` for a ``None`` input, an input without
    ``address_components``, or when no component carries the ``route`` type.
    """
    usable = geocode_data is not None and 'address_components' in geocode_data
    if not usable:
        return np.nan
    for part in geocode_data['address_components']:
        if 'route' not in part['types']:
            continue
        return part['short_name']
    return np.nan

def parse_street_num(geocode_data):
    """Return the ``short_name`` of the first ``street_number`` component.

    ``np.nan`` is returned when ``geocode_data`` is ``None``, holds no
    ``address_components`` entry, or none of the components is typed
    ``street_number``.
    """
    if geocode_data is None:
        return np.nan
    if 'address_components' not in geocode_data:
        return np.nan
    numbers = [
        part['short_name']
        for part in geocode_data['address_components']
        if 'street_number' in part['types']
    ]
    return numbers[0] if numbers else np.nan

def parse_okrug(geocode_data):
    """Return the ``short_name`` of the first sublocality component.

    A component qualifies when its ``types`` contain ``sublocality`` or
    ``sublocality_level_1``.  ``np.nan`` is returned for a ``None`` input, an
    input without ``address_components``, or when nothing qualifies.
    """
    if geocode_data is None or 'address_components' not in geocode_data:
        return np.nan
    for part in geocode_data['address_components']:
        kinds = part['types']
        if 'sublocality' in kinds or 'sublocality_level_1' in kinds:
            return part['short_name']
    return np.nan

def parse_post(geocode_data):
    """Return the ``short_name`` of the first ``postal_code`` component.

    Gives ``np.nan`` when ``geocode_data`` is ``None``, has no
    ``address_components`` entry, or no component is typed ``postal_code``.
    """
    if geocode_data is None or 'address_components' not in geocode_data:
        return np.nan
    found = np.nan
    for part in geocode_data['address_components']:
        if 'postal_code' in part['types']:
            found = part['short_name']
            break
    return found

def reverse_geocode(latlng):
    """Reverse-geocode ``latlng`` via Google and cache the hit in ``geocode.csv``.

    The cache file has the columns ``geocode_data`` and ``latlng``.  A key
    that is already cached is not requested again.

    Args:
        latlng: ``"<lat>,<lon>"`` string; also the key stored in the cache.

    Returns:
        dict or None: The first element of the response's ``results`` list
        when a new lookup succeeded.  ``None`` when ``latlng`` is already in
        the cache or the response contained no results.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    cache_path = 'geocode.csv'
    # A missing cache file just means nothing has been geocoded yet; the old
    # unconditional read_csv crashed on the very first call.
    cached = pd.read_csv(cache_path) if os.path.exists(cache_path) else None
    if cached is not None and (cached['latlng'] == latlng).any():
        return None

    time.sleep(0.1)
    # NOTE(review): a credential is hard-coded in this notebook.  It is kept
    # only as a fallback so existing runs keep working; rotate it and supply
    # the key through the GOOGLE_MAPS_API_KEY environment variable instead.
    api_key = os.environ.get('GOOGLE_MAPS_API_KEY',
                             'AIzaSyDLxUAEWvUfCZZ-rO7qmsn_7_6OoC6katQ')
    # Log only the coordinates: the full request URL contains the API key.
    log.info('reverse geocoding %s', latlng)
    # The Geocoding API reads the credential from the `key` parameter; the
    # former `api_key` parameter name is not one it recognises.
    response = requests.get(
        'https://maps.googleapis.com/maps/api/geocode/json',
        params={'latlng': latlng, 'language': 'en', 'key': api_key},
        timeout=30,  # fail instead of hanging the whole run on a stalled socket
    )
    data = response.json()
    results = data.get('results')
    if not results:
        log.warning('no geocode result for %s (status=%s)', latlng, data.get('status'))
        return None

    first_hit = results[0]
    # Always store the new key.  The old "is this result already cached?"
    # test compared a dict against the strings read back from the CSV, and
    # skipping the write would leave this latlng without a cache row.
    new_row = pd.DataFrame({'geocode_data': [first_hit], 'latlng': [latlng]})
    if cached is None:
        updated = new_row
    else:
        updated = pd.concat([cached, new_row], ignore_index=True)
    updated.to_csv(cache_path, index=False)
    return first_hit
            
def _decode_geocode(raw):
    """Turn one cached ``geocode_data`` cell back into a dict.

    ``geocode.csv`` stores each result as the text form of a dict, so values
    read back from it are strings.  Dicts pass through unchanged; missing or
    unparsable cells become an empty dict, for which every ``parse_*`` helper
    returns ``np.nan``.
    """
    import ast

    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        try:
            decoded = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return {}
        return decoded if isinstance(decoded, dict) else {}
    return {}


def geo_fix(df):
    """Attach reverse-geocoded address fields to every transaction.

    Steps: build a ``latlng`` key per row (ATM coordinates when both are
    present, POS coordinates otherwise, rounded to 3 decimals), make sure
    every distinct key is in ``geocode.csv``, join the cache onto the frame
    and derive the ``parse_*`` columns.  The result is also written to
    ``city.csv``.

    Args:
        df: Transactions with ``atm_address_lat``, ``atm_address_lon``,
            ``pos_adress_lat`` and ``pos_adress_lon`` columns.  A ``latlng``
            column is added to this frame in place.

    Returns:
        A new DataFrame: the input joined with the cache on ``latlng``, plus
        ``parse_city``, ``parse_country``, ``parse_street``,
        ``parse_street_num``, ``parse_okrug`` and ``parse_post``.
    """
    tqdm_notebook.pandas()

    def _latlng_key(row):
        # Both ATM coordinates must be present; the previous check tested the
        # latitude twice and never looked at the longitude.
        has_atm = pd.notnull(row['atm_address_lat']) and pd.notnull(row['atm_address_lon'])
        if has_atm:
            lat, lon = row['atm_address_lat'], row['atm_address_lon']
        else:
            lat, lon = row['pos_adress_lat'], row['pos_adress_lon']
        return '{},{}'.format(round(lat, 3), round(lon, 3))

    df['latlng'] = df.progress_apply(_latlng_key, axis=1)

    # Geocode each distinct key once; repeated rows would only re-read the
    # cache file to discover that the key is already present.
    for latlng in tqdm_notebook(df['latlng'].unique()):
        reverse_geocode(latlng)

    tmp = pd.read_csv('geocode.csv')
    # The cache brings its own geocode_data column.  A same-named column on
    # the left side (earlier versions pre-created one filled with '') makes
    # merge emit geocode_data_x / geocode_data_y, and the lookups below fail.
    if 'geocode_data' in df.columns:
        df = df.drop('geocode_data', axis=1)
    # NOTE(review): this is an inner join, so rows whose key has no cache
    # entry (e.g. the API returned no results) are dropped -- confirm that
    # this is intended.
    df = df.merge(tmp, on='latlng')

    # Values read from the CSV are strings; the parse_* helpers index dicts.
    df['geocode_data'] = df['geocode_data'].map(_decode_geocode)
    df['parse_city'] = df['geocode_data'].progress_map(parse_city)
    df['parse_country'] = df['geocode_data'].progress_map(parse_country)
    df['parse_street'] = df['geocode_data'].progress_map(parse_street)
    df['parse_street_num'] = df['geocode_data'].progress_map(parse_street_num)
    df['parse_okrug'] = df['geocode_data'].progress_map(parse_okrug)
    df['parse_post'] = df['geocode_data'].progress_map(parse_post)
    df.to_csv('city.csv', encoding='utf-8', index=False)
    return df

In [None]:
# Run the geocoding enrichment on the training frame; the result is bound to `df`.
df = geo_fix(train)