### Description

Data set
- terminal_id идентификационный номер терминала
- customer_id идентификационный номер клиента
- amount количество потраченных средств за транзакцию
- country страна
- currency валюта
- mcc код категории продавца
- transaction_date дата транзакции
- atm_address адрес банкомата
- pos_address адрес установки pos-терминала
- pos_address_lat адрес установки pos-терминала широта
- pos_address_lon адрес установки pos-терминала долгота
- work_add_lat широта работы клиента
- work_add_lon долгота работы клиента
- home_add_lat широта дома клиента
- home_add_lon долгота дома клиента

Вам предстоит предсказать две пары координат `(_HOME_LAT_,_HOME_LON_,_WORK_LAT_,_WORK_LON_)`: работы и дома. В качестве датасета участникам предоставлены истории транзакций клиентов «Райффайзенбанк».

In [1]:
import pandas as pd
import numpy as np

import holidayapi

import urllib.request as ur
import time

from tqdm import tqdm_notebook
from multiprocessing import Pool

import json
from pandas.io.json import json_normalize

### Load part

In [2]:
train = pd.read_csv('data/train_set.csv', encoding = 'utf-8')
test = pd.read_csv('data/test_set.csv', encoding = 'utf-8')
cnt = pd.read_csv('data/all.csv', encoding = 'utf-8')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
train.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_adress_lat,pos_adress_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.858198,30.229024,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177


### Feature extracting part

#### Date split

In [4]:
def holidays(d, m, year):
    time.sleep(0.2)
    elevations = ur.urlopen("https://kayaposoft.com/enrico/json/v2.0/?action=isPublicHoliday&date="+str(d)+'-'+str(m)+'-'+str(year)+"&country=ru").read()
    data = json.loads(elevations)
    if data['isPublicHoliday'] == True:
        return 1
    else:
        return 0

def extr(df):
    df = df.set_value(df[pd.isnull(df['transaction_date'])].index, 'transaction_date', '2017-02-01')
    
    tmp = pd.DataFrame(list(zip(df.transaction_date.unique(), [holidays(d.split('-')[2], d.split('-')[1], d.split('-')[0]) for d in tqdm_notebook(df.transaction_date.unique())])), 
                       columns=['date', 'value'])
    
    df['holiday'] = df['transaction_date'].apply(lambda x: tmp[tmp['date'] == x]['value'].as_matrix()[0])
    df['tm_year'] = df['transaction_date'].apply(lambda x: time.strptime(str(x), '%Y-%m-%d')[0])
    df['tm_mon'] = df['transaction_date'].apply(lambda x:  time.strptime(str(x), '%Y-%m-%d')[1])
    df['tm_mday'] = df['transaction_date'].apply(lambda x: time.strptime(str(x), '%Y-%m-%d')[2])
    df['tm_wday'] = df['transaction_date'].apply(lambda x: time.strptime(str(x), '%Y-%m-%d')[6])
    df['tm_yday'] = df['transaction_date'].apply(lambda x: time.strptime(str(x), '%Y-%m-%d')[7])
    del df['transaction_date']
    return df

train = extr(train)
test = extr(test)

In [5]:
train = pd.read_csv('data/train_date.csv', encoding = 'utf-8', index_col=0)
test = pd.read_csv('data/test_date.csv', encoding = 'utf-8', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)
  interactivity=interactivity, compiler=compiler, result=result)


#### Country fix

In [6]:
def cnt_chng(x):
    if x in cnt['alpha-2'].as_matrix():
        return x
    elif x in cnt['alpha-3'].as_matrix():
        return cnt[cnt['alpha-3'] == x]['alpha-2'].as_matrix()[0]
    else:
        return np.nan

train['country'] = train['country'].apply(func = (lambda x: cnt_chng(x.split()[0])))
test['country'] = test['country'].apply(func = (lambda x: cnt_chng(x.split()[0])))

#### Geo fix

In [17]:
def adr(adr):
    return 0
    
def pos(cordinates):
    lat, lon = cordinates[0], cordinates[1]
    elevations = ur.urlopen("https://geocode-maps.yandex.ru/1.x/?format=json&geocode=" +str(lon)+","+str(lat)+"&lang=en_US&kind=house&kind=locality").read()
    data = json.loads(elevations)
    df = json_normalize(data['response']['GeoObjectCollection']['featureMember'], errors= 'ignore')
    df.columns = [c.split('.')[len(c.split('.'))-1] for c in df.columns]
    #print (df)
    if len(df)!=0:
        return df.head(1)
    else:
        return pd.DataFrame([np.nan, np.nan, np.nan, np.nan], columns=['name', 'country_code', 'postal_code', 'LocalityName'])

def res(result, df):
    result =  pd.concat(result).reset_index(drop=True)
    df = pd.concat([train, result], axis=1)
    return df

def geo_fix(df):
    pool = Pool(processes=4)
    df = res(pool.map(pos, df[~pd.isnull(train['atm_address_lat'])][['atm_address_lat','atm_address_lon']].as_matrix()), df)
    df = res(pool.map(pos, df[~pd.isnull(train['pos_adress_lon'])][['pos_adress_lat','pos_adress_lon']].as_matrix()), df)
    pool.close()
    pool.join()
    return df

In [None]:
train = geo_fix(train)

In [8]:
train[~pd.isnull(train['atm_address_lat'])][['atm_address_lat','atm_address_lon']].as_matrix()[:10].shape

(10, 2)

In [None]:
train.head()

In [10]:
train[(~np.isnan(train['atm_address_lat'])) & (~np.isnan(train['pos_adress_lat']))]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,...,pos_adress_lon,terminal_id,work_add_lat,work_add_lon,holiday,tm_year,tm_mon,tm_mday,tm_wday,tm_yday


In [11]:
elevations = ur.urlopen("https://geocode-maps.yandex.ru/1.x/?format=json&geocode=55.744, 37.663&lang=en_US").read()

In [12]:
data = json.loads(elevations)
df = json_normalize(data['response']['GeoObjectCollection']['featureMember'], errors= 'ignore')

In [13]:
df.columns = [c.split('.')[len(c.split('.'))-1] for c in df.columns]
df.columns

Index(['pos', 'lowerCorner', 'upperCorner', 'description', 'Components',
       'country_code', 'formatted', 'AddressLine', 'AdministrativeAreaName',
       'CountryName', 'CountryNameCode', 'kind', 'precision', 'text', 'name'],
      dtype='object')

In [14]:
df[for i in ]

Unnamed: 0,pos,lowerCorner,upperCorner,description,Components,country_code,formatted,AddressLine,AdministrativeAreaName,CountryName,CountryNameCode,kind,precision,text,name
0,55.308428 37.578935,53.723036 36.491486,56.314056 38.111537,Iran,"[{'kind': 'country', 'name': 'Iran'}, {'kind':...",IR,Golestan,Golestan,Golestan,Iran,IR,province,other,"Iran, Golestan",Golestan
1,54.154919 32.046882,44.032702 24.872455,63.317241 39.782107,,"[{'kind': 'country', 'name': 'Iran'}]",IR,,,,Iran,IR,country,other,Iran,Iran


In [None]:
df[['name', 'country_code', 'postal_code', 'ThoroughfareName', 'LocalityName']].as_matrix()[0]

In [None]:
df['AddressLine'].head()

In [None]:
np.dtype?