In [1]:
import re
import json
import codecs

from datetime import datetime
from collections import Counter
from operator import itemgetter

import numpy as np
import pandas as pd

import gmplot
import geopy.distance

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Column -> dtype mapping passed to pd.read_csv for the train and test CSVs.
# Every numeric column (amount, currency code, all coordinates) is read as a
# 32-bit float; identifiers, free text and the date are kept as strings.
dtypes = dict.fromkeys(
    [
        'amount',
        'currency',
        'atm_address_lat', 'atm_address_lon',
        'pos_address_lat', 'pos_address_lon',
        'home_add_lat', 'home_add_lon',
        'work_add_lat', 'work_add_lon',
    ],
    np.float32
)
dtypes.update(dict.fromkeys(
    [
        'atm_address', 'pos_address',
        'city', 'country',
        'customer_id', 'terminal_id',
        'mcc',
        'transaction_date',
    ],
    str
))

In [3]:
# Training set: read exactly the columns declared in `dtypes` and tag the rows
# so they can be told apart from the test rows after concatenation.
use_columns = list(dtypes)

df_1 = pd.read_csv(
    "train_set.csv",
    sep=',',
    encoding='utf-8',
    dtype=dtypes,
    usecols=use_columns
)
df_1["is_train"] = True
df_1.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,is_train
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177,True
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177,True
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177,True
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177,True
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177,True


In [4]:
# Test set: same columns as the training set minus the home_*/work_*
# coordinates, which are not requested here.
use_columns = list(
    column for column in dtypes
    if not (column.startswith('home_') or column.startswith('work_'))
)

df_2 = pd.read_csv(
    "test_set.csv",
    sep=',',
    encoding='utf-8',
    dtype=dtypes,
    usecols=use_columns
)
# Strip commas from the MCC codes of the test set.
df_2["mcc"] = df_2["mcc"].map(lambda code: re.sub(',', '', code))
df_2["is_train"] = False
df_2.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,is_train
0,2.211818,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,2-1 TOLMACHEVA STRIVANTEEVKA141280 RUSRUS,55.967487,37.913681,ff0476dae4b098a7b16aabe93d4268df,2017-08-24,False
1,1.331379,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,4111,"SOVETSKIJ, 32IVANTEEVKA141282 RUSRUS",55.971294,37.905186,7cfd9a60282459d4692ecc85b856072e,2017-08-12,False
2,2.608004,,,,PUSHKINO,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"105,KRASOARMEYSKOE SHPUSHKINO141206 RUSRUS",56.01659,37.9091,7e5a532f0029861d8a9c4f0479b9450b,2017-06-17,False
3,1.916752,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"G. IVANTEEVKA, UL.TOLMACHEVA, D.6IVANTEEVKA141...",55.964508,37.937912,2afe7d1bc61b86c449f413bdf2119032,2017-08-12,False
4,1.981067,,,,MOSCOW,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5814,5 KOMSOMOLSKAYA SQMOSCOW101000 RUSRUS,55.776802,37.657352,ab4f00601ff1d949afc59ee3f804c79c,2017-04-26,False


In [5]:
# Stack the train and test rows into a single frame; the `is_train` flag set
# above tells them apart. ignore_index is not passed, so the row labels of
# df_1 and df_2 are carried over unchanged.
df = pd.concat([df_1, df_2], axis=0)

In [6]:
import json
import requests

class YandexGeocoder(object):
    """Caching client for the Yandex geocoder HTTP API.

    Raw JSON answers are stored per address in ``self.cache`` so every address
    is requested at most once. The cache can be written to / restored from a
    JSON file with ``dump_cache`` / ``load_cache``.
    """

    url = u'https://geocode-maps.yandex.ru/1.x/'
    # Upper bound, in seconds, passed to requests.get for every HTTP call.
    timeout = 10
    # A couple of kludges :) -- these addresses are answered with a fixed
    # (lat, lon) pair instead of being sent to the geocoder.
    _known_points = {
        u'Анапа, мк-н Аэропорт, д.': (45.007, 37.346),
        u'Москва, дер. Рассказовка, д. 200, Ленинский р-он': (55.631, 37.334),
        u'Подольск, 42-й км автомагистрали М-2-Крым, влад. 1': (55.431, 37.546),
    }

    def __init__(self):
        # Per-instance cache: a dict defined on the class body would be shared
        # (and mutated) by every instance.
        self.cache = dict()

    def ask(self, address):
        """Return the raw JSON answer for ``address``, querying the API on a cache miss.

        Raises whatever ``requests`` raises for connection problems, timeouts,
        HTTP error statuses or a non-JSON body; nothing is cached in that case.
        """
        if address not in self.cache:
            r = requests.get(
                self.url,
                params={
                    'format': 'json',
                    'geocode': address
                },
                timeout=self.timeout
            )
            # Do not let an HTTP error page end up in the persistent cache.
            r.raise_for_status()
            self.cache[address] = r.json()

        return self.cache[address]

    def point(self, address):
        """Return coordinates for ``address`` as a 2-tuple of floats.

        The tuple is one of the hard-coded pairs above, or the first geocoder
        match with the tokens of its ``pos`` string reversed (so that it has
        the same order as the hard-coded pairs). ``(nan, nan)`` is returned,
        and a message printed, when the lookup or the parsing fails.
        """
        if address in self._known_points:
            return self._known_points[address]

        try:
            answer = self.ask(address)
            pos = answer['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['Point']['pos']
            return tuple(float(token) for token in reversed(pos.split()))
        except Exception:
            # Best effort: report and continue. `Exception` (rather than a bare
            # except) keeps KeyboardInterrupt / SystemExit working.
            print(u"Address '{}' is not found!".format(address))
            return (np.nan, np.nan)

    def dump_cache(self, filename):
        """Write the whole cache to ``filename`` as compact UTF-8 JSON."""
        with codecs.open(filename, mode='w', encoding='utf-8') as f_cache:
            json.dump(self.cache, f_cache, separators=(',', ':'))

    def load_cache(self, filename):
        """Replace the cache with the JSON content of ``filename``."""
        with codecs.open(filename, mode='r', encoding='utf-8') as f_cache:
            self.cache = json.load(f_cache)
            
# Notebook-wide geocoder instance. Its cache is pre-populated from the JSON
# file that geocoder.dump_cache(...) writes near the end of the notebook;
# load_cache opens the file for reading, so it has to exist already.
geocoder = YandexGeocoder()
geocoder.load_cache('data/yandex_geocoder.json')

In [7]:
# An address written in Russian is the indicator that the point belongs to Raiffeisen

def is_raiff(s):
    """Tell whether address ``s`` looks like a Raiffeisen one.

    Returns True when ``s`` contains no ASCII Latin letter, False when it
    contains at least one, and False when ``re.search`` rejects ``s`` with a
    TypeError (e.g. a missing value that is not a string).
    """
    try:
        latin_letter = re.search(r'[a-zA-Z]', s, flags=re.U)
    except TypeError:
        return False
    return latin_letter is None

In [8]:
# Replace the coordinates of Raiffeisen ATM rows (rows that already have both
# coordinates and whose address passes is_raiff) with the geocoder's answer
# for their address.
mask = df[["atm_address_lat", "atm_address_lon"]].notnull().all(axis=1)
mask = np.logical_and(mask, df["atm_address"].map(is_raiff))

# Double quotes are removed from the address before it is sent to the geocoder.
addresses = df.loc[mask, "atm_address"].map(lambda s: re.sub(r'"', '', s, flags=re.U))

# list(...) materialises the result whether or not map() is lazy, and
# reshape(-1, 2) keeps the array two-dimensional even when no row is selected,
# so the column slices below cannot fail on an empty selection.
coords = np.array(list(map(geocoder.point, addresses)), dtype=float).reshape(-1, 2)

# NOTE(review): geocoder.point returns (nan, nan) for an address it cannot
# resolve, so such rows lose the coordinates they had -- confirm this is intended.
df.loc[mask, "atm_address_lat"] = coords[:, 0]
df.loc[mask, "atm_address_lon"] = coords[:, 1]

In [9]:
# One row per terminal: median coordinates (and median is_raiff flag) over all
# transactions that have both ATM coordinates, ordered by latitude, then longitude.
mask = df[["atm_address_lat", "atm_address_lon"]].notnull().all(axis=1)

df_atms = df.loc[mask, ["terminal_id", "atm_address_lat", "atm_address_lon", "atm_address"]]
df_atms["is_raiff"] = df_atms["atm_address"].map(is_raiff)
# NOTE(review): median() is also asked to aggregate the text column
# "atm_address"; the rendered output shows it being dropped. Newer pandas
# releases may raise here instead -- confirm before upgrading.
df_atms = (
    df_atms
    .groupby(by='terminal_id')
    .median()
    .sort_values(by=["atm_address_lat", "atm_address_lon"])
)
df_atms.head()

Unnamed: 0_level_0,atm_address_lat,atm_address_lon,is_raiff
terminal_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c693dcbafb5e0c1f9a58ac0211d79ed8,42.055,48.294998,False
3ecc362617966cb7f4563c9e4e89b8a1,42.056499,48.297501,False
cd614b782e35146bacbad19e800e0624,42.056999,48.296001,False
fca99d9df0e01bdebca13824e1db1dfe,42.057999,48.296001,False
fabf56c3e89566e248ea94636cead5cb,42.061501,48.290501,False


In [10]:
# Peek at the transaction rows that have ATM coordinates and a Raiffeisen-style address.
mask = df[["atm_address_lat", "atm_address_lon"]].notnull().all(axis=1)
df_tmp = df.loc[
    mask,
    ["terminal_id", "atm_address_lat", "atm_address_lon", "atm_address"]
]
df_tmp = df_tmp.loc[df_tmp["atm_address"].map(is_raiff)]
df_tmp.head()

Unnamed: 0,terminal_id,atm_address_lat,atm_address_lon,atm_address
942660,0004b2a8bef47fa9af780c7679aba5d1,55.740131,37.663799,"Москва, ул. Таганская, д. 17-23"
942661,0004b2a8bef47fa9af780c7679aba5d1,55.740131,37.663799,"Москва, ул. Таганская, д. 17-23"
942662,0004b2a8bef47fa9af780c7679aba5d1,55.740131,37.663799,"Москва, ул. Таганская, д. 17-23"
942663,0004b2a8bef47fa9af780c7679aba5d1,55.740131,37.663799,"Москва, ул. Таганская, д. 17-23"
942664,0004b2a8bef47fa9af780c7679aba5d1,55.740131,37.663799,"Москва, ул. Таганская, д. 17-23"


In [11]:
# Reference table of bank ATMs / offices with their coordinates.
# NOTE(review): get_banks_area below runs np.searchsorted on the "lat" column
# of this table, which presumes the CSV rows are sorted by ascending latitude
# -- TODO confirm.
df_banks = pd.read_csv("data/atm_addresses.csv", sep=',', encoding='utf-8')
df_banks.head()

Unnamed: 0,bank,name,type,address,lat,lon,access24h,city
0,Россельхозбанк,"Администрация СП ""Сельсовет Ахтынский""",atm,"АХТЫ, ЗДАНИЕ СЕЛЬСОВЕТА",41.460488,47.731741,False,AKHTY
1,Россельхозбанк,здание УФК,atm,"РУТУЛ, ЗДАНИЕ УФК",41.536256,47.435614,False,RUTUL
2,Россельхозбанк,Дополнительный офис Касумкент,office,"368760, Республика Дагестан, Сулейман-Стальски...",41.677346,48.152421,False,KASUMKENT
3,Россельхозбанк,доп.офис,atm,"КАСУМКЕНТ, УЛ.СТАЛЬСКОГО,Д.7",41.677581,48.155619,True,KASUMKENT
4,Россельхозбанк,Дополнительный офис Хив,office,"368680, Республика Дагестан, Хивский район, с....",41.75378,47.9276,False,KHIV


In [12]:
# (lat, lon) array of the bank table, read as a global by get_atm_info.
# This rebinds `coords`, which earlier held the geocoded ATM coordinates.
coords = df_banks[["lat", "lon"]].values

In [13]:
def get_banks_area(coords, lat, lon, s=0.02):
    """Return the row positions of ``coords`` inside a square around (lat, lon).

    ``coords`` is a 2-D array whose column 0 is latitude and column 1 is
    longitude. Column 0 is searched with np.searchsorted, so it has to be in
    ascending order. ``s`` is the half-size of the square, in the same units
    as the coordinates; both edges are inclusive.

    Returns a list of integer positions into ``coords``, in ascending order.
    """
    # Rows whose latitude lies in [lat - s, lat + s] form a contiguous band.
    lat_l = np.searchsorted(coords[:, 0], lat - s, side='left')
    lat_r = np.searchsorted(coords[:, 0], lat + s, side='right')

    # Within that band keep the rows whose longitude is in [lon - s, lon + s].
    band_lons = coords[lat_l:lat_r, 1]
    inside = np.logical_and(band_lons >= lon - s, band_lons <= lon + s)

    return list(lat_l + np.flatnonzero(inside))


def filter_results(df_results, lat, lon, max_dist_km=1.0):
    """Return the rows of ``df_results`` close to (lat, lon), nearest first.

    A "dist" column holding the Vincenty distance in kilometres between each
    row's ("lat", "lon") and the given point is added, rows farther than
    ``max_dist_km`` are dropped and the rest is sorted by that distance.

    The input frame is left untouched; a new frame is returned.
    """
    # Work on a copy: the caller usually passes a slice of a bigger frame, and
    # writing into it both mutates the caller's data and triggers pandas'
    # SettingWithCopyWarning.
    df_near = df_results.copy()
    df_near["dist"] = np.array(
        [
            geopy.distance.vincenty(coord, (lat, lon)).km
            for coord in zip(df_near["lat"], df_near["lon"])
        ],
        dtype=float
    )
    df_near = df_near.loc[df_near["dist"] <= max_dist_km]
    return df_near.sort_values(by="dist")


def _find_nearby_banks(lat, lon):
    """Return the df_banks rows that filter_results keeps around (lat, lon), nearest first."""
    i_index = get_banks_area(coords, lat, lon)
    # get_banks_area yields row *positions* of `coords`, hence .iloc, not .loc.
    return filter_results(df_banks.iloc[i_index], lat, lon)


def get_atm_info(lat, lon, is_raiff):
    """Match an ATM position against the bank reference table.

    Reads the notebook globals ``df_banks`` and ``coords`` (the lat/lon array
    of ``df_banks``).

    lat, lon  -- position of the ATM.
    is_raiff  -- truthy when the ATM is considered a Raiffeisen one.

    Returns a pd.Series with the keys 'bank', 'lat', 'lon', 'access24h',
    'is_office', 'n_banks' and 'city'.

    * Matched: the bank is Райффайзенбанк for a Raiffeisen ATM, otherwise the
      bank of the nearest reference point (which then has to be within 0.3 km).
      'lat'/'lon'/'city' are taken from the nearest point of that bank,
      'access24h'/'is_office' are True if any retained point of that bank has
      the property, and 'n_banks' counts the reference points around the
      matched position.
    * Not matched: the input position is returned with bank 'unknown' (or
      Райффайзенбанк when ``is_raiff``), an empty city, both flags False and
      'n_banks' equal to the number of reference points around the input
      position plus one.
    """
    df_results = _find_nearby_banks(lat, lon)
    n_banks = df_results.shape[0]

    if n_banks > 0:
        if is_raiff:
            bank_chosen = u'Райффайзенбанк'
            mask = df_results["bank"] == bank_chosen
        else:
            # Nearest reference point decides the bank; only its points within
            # 0.3 km of the ATM are kept.
            bank_chosen = df_results["bank"].iloc[0]
            mask = np.logical_and(
                df_results["dist"] <= 0.3,
                df_results["bank"] == bank_chosen
            )
        df_results = df_results[mask]

        # NOTE(review): the original code then walked the retained rows looking
        # for the first one whose distance differs from the nearest by more
        # than 0.15, but never used the position it found. That no-op loop was
        # dropped; if truncating to that cluster was intended it still has to
        # be implemented.

        if df_results.shape[0] > 0:
            # Snap to the nearest retained point and recount its neighbourhood.
            city, lat, lon = df_results.iloc[0][["city", "lat", "lon"]]
            n_banks = _find_nearby_banks(lat, lon).shape[0]

            return pd.Series({
                'bank': bank_chosen,
                'lat': lat,
                'lon': lon,
                'access24h': (df_results["access24h"] == True).any(),
                'is_office': (df_results["type"] == 'office').any(),
                'n_banks': n_banks,
                'city': city
            })

    # Nothing matched: keep the input position and count the ATM itself.
    return pd.Series({
        'bank': u'Райффайзенбанк' if is_raiff else 'unknown',
        'lat': lat,
        'lon': lon,
        'access24h': False,
        'is_office': False,
        'n_banks': n_banks + 1,
        'city': ''
    })

In [14]:
# Enrich every terminal with the bank-table match computed by get_atm_info.
df_atms_modified = df_atms.apply(
    lambda row: get_atm_info(
        lat=row['atm_address_lat'],
        lon=row['atm_address_lon'],
        is_raiff=row['is_raiff']
    ),
    axis=1
)
df_atms_modified.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,access24h,bank,city,is_office,lat,lon,n_banks
terminal_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c693dcbafb5e0c1f9a58ac0211d79ed8,False,Россельхозбанк,DERBENT,True,42.056451,48.298047,4
3ecc362617966cb7f4563c9e4e89b8a1,True,Россельхозбанк,DERBENT,True,42.056451,48.298047,4
cd614b782e35146bacbad19e800e0624,False,Россельхозбанк,DERBENT,True,42.056451,48.298047,4
fca99d9df0e01bdebca13824e1db1dfe,False,Россельхозбанк,DERBENT,True,42.056451,48.298047,4
fabf56c3e89566e248ea94636cead5cb,False,unknown,,False,42.061501,48.290501,5


In [15]:
# Number of terminals attributed to each bank.
for name, group in df_atms_modified.groupby("bank"):
    print(u"{} {}".format(name, group.shape[0]))

unknown 1814
Банк Уралсиб 188
Бинбанк 152
Газпромбанк 320
Московский Кредитный Банк 540
Райффайзенбанк 2422
Росбанк 110
Россельхозбанк 172
Энерготрансбанк 18
ЮниКредит Банк 163


In [16]:
# Persist the per-terminal features; index=True keeps terminal_id in the file.
df_atms_modified.to_csv("data/atm_features.csv", sep=',', encoding='utf-8', index=True)

In [17]:
# Write the geocoder cache back to the same file it was loaded from, so
# addresses resolved during this run are available to the next one.
geocoder.dump_cache('data/yandex_geocoder.json')

# Отладочная зона

In [18]:
# Debug: inspect ten consecutive terminals starting at an arbitrary position.
n_offset = 2360
df_atms_modified.iloc[n_offset:n_offset+10]

Unnamed: 0_level_0,access24h,bank,city,is_office,lat,lon,n_banks
terminal_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9802ee5fbcf94c779c3a083723efc38e,False,Райффайзенбанк,MOSKVA,True,55.702448,37.509199,16
be08e397ca9806405cbe943d7222468c,False,Райффайзенбанк,MOSKVA,True,55.702448,37.509199,16
be99d86eed3004c939d8371d8b6a63aa,False,Райффайзенбанк,MOSKVA,True,55.702448,37.509199,16
1a79cebc4e26e3bf16b243446bf061a3,False,unknown,,False,55.702999,37.324001,1
023415b9328608b0db687f4ad078d541,True,Газпромбанк,MOSKVA,True,55.702768,37.509244,16
8436ccc7ece33933311fe68c3d392693,False,unknown,,False,55.702999,37.661999,16
b511ad6067892e073dac29702b47c9e4,False,unknown,,False,55.702999,37.681999,4
976d9d9c3061417125432139f9432cb3,False,unknown,,False,55.702999,37.855999,11
ac038e39decaf94cd147c2867ea6a1fa,True,Россельхозбанк,MOSKVA,False,55.704368,37.657696,12
2039586af14d06bc2eac588515fbd7f6,False,Газпромбанк,MOSKVA,False,55.705381,37.47916,1


In [19]:
# Debug: all transactions of one terminal (the same id is assigned to
# `terminal_id` in the next cell).
df[df["terminal_id"] == "677240ffb403f1cefbeac1f7a6672b13"]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
1001050,3.601877,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,68647e69272ab5efd72a3bd6a46a5c93,55.841000,37.389999,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-08-04,55.759998,37.601002
1001051,3.588972,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,68647e69272ab5efd72a3bd6a46a5c93,55.841000,37.389999,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-05-22,55.759998,37.601002
1001052,3.586941,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,68647e69272ab5efd72a3bd6a46a5c93,55.841000,37.389999,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-05-22,55.759998,37.601002
1001053,3.596629,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,68647e69272ab5efd72a3bd6a46a5c93,55.841000,37.389999,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-02-21,55.759998,37.601002
1001054,3.172686,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,13a53cf331528fb95d51d51bc588fdf6,55.625999,37.749001,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-05-26,55.750000,37.577999
1001055,3.648920,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,13a53cf331528fb95d51d51bc588fdf6,55.625999,37.749001,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-03-10,55.750000,37.577999
1001056,3.574145,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,68647e69272ab5efd72a3bd6a46a5c93,55.841000,37.389999,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-11-03,55.759998,37.601002
1001057,3.680061,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,68647e69272ab5efd72a3bd6a46a5c93,55.841000,37.389999,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-09-21,55.759998,37.601002
1001058,3.587452,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,68647e69272ab5efd72a3bd6a46a5c93,55.841000,37.389999,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-04-21,55.759998,37.601002
1001059,3.611631,"Москва, пер. Вознесенский, д. 7",55.758057,37.604233,MOSKVA,RUS,643.0,68647e69272ab5efd72a3bd6a46a5c93,55.841000,37.389999,True,6011,,,,677240ffb403f1cefbeac1f7a6672b13,2017-07-24,55.759998,37.601002


In [20]:
# Debug: list the reference points around one terminal, nearest first.
terminal_id = "677240ffb403f1cefbeac1f7a6672b13"

lat, lon = df_atms.loc[terminal_id, ["atm_address_lat", "atm_address_lon"]]
print("%s %s" % (lat, lon))

i_index = get_banks_area(coords, lat, lon)
df_results = df_banks.loc[i_index]

# Vincenty distance in km from every candidate to the terminal position.
df_results["dist"] = list(
    geopy.distance.vincenty(point, (lat, lon)).km
    for point in zip(df_results["lat"], df_results["lon"])
)
df_results = df_results.sort_values(by="dist")
df_results

55.7581 37.6042


Unnamed: 0,bank,name,type,address,lat,lon,access24h,city,dist
11335,Газпромбанк,"ГУП ""ФХУ Комплекса архитектуры, строительства,...",atm,"Москва, Москва, Никитский пер., д. 5, стр. 6",55.757070,37.609829,False,MOSKVA,0.368118
11377,Росбанк,Банкомат,atm,"г. Москва, Тверской бул., д. 13/1",55.760525,37.599355,True,MOSKVA,0.411474
11378,Райффайзенбанк,Банкомат,atm,"г. Москва, бул. Тверской, д. 13, стр. 1",55.760593,37.599404,False,MOSKVA,0.414309
11363,Банк Уралсиб,"Банкомат. ТЦ ""Галерея Тверская 9""",atm,"Москва и область, город Москва, улица Тверская, 9",55.759856,37.610548,False,MOSKVA,0.444213
11382,Энерготрансбанк,Терминал офисный,atm,"ул. Большая Бронная, 8, стр.1, г. Москва",55.761173,37.599507,False,MOSKVA,0.456516
11307,Райффайзенбанк,Отделение &laquo;Романов Двор&raquo;,office,"125009, г. Москва, пер. Романов, д. 4",55.754725,37.608922,False,MOSKVA,0.473604
11306,Райффайзенбанк,Банкоматы,atm,"г. Москва, пер. Романов, д. 4/4",55.754711,37.608929,True,MOSKVA,0.475064
11370,Московский Кредитный Банк,Перекресток Экспресс,atm,"г. Москва, ул. Тверская, д. 6, стр. 1",55.760241,37.611347,False,MOSKVA,0.508554
11406,Росбанк,Банкоматы,atm,"г. Москва, ул. Тверская, д. 15",55.761970,37.608542,False,MOSKVA,0.512834
11407,Росбанк,Отделение &laquo;Территориальный офис Северный...,office,"103009, г. Москва, ул. Тверская, д. 15",55.762155,37.608311,False,MOSKVA,0.523220


In [21]:
# Debug map for the terminal selected above, written to map.html:
#   red   -- ATM coordinates of the terminal's individual transactions,
#   blue  -- reference points from df_results,
#   green -- the terminal's own (lat, lon) taken from df_atms.
df_coords = df.loc[df["terminal_id"] == terminal_id, ['atm_address_lat', 'atm_address_lon']]
df_coords_centers = df_results

gmap = gmplot.GoogleMapPlotter(lat, lon, 14)

# (latitudes, longitudes, colour) for each layer; all layers share the same
# marker settings.
scatter_layers = [
    (df_coords['atm_address_lat'], df_coords['atm_address_lon'], 'red'),
    (df_coords_centers['lat'], df_coords_centers['lon'], 'blue'),
    ([lat], [lon], 'green'),
]
for layer_lats, layer_lons, layer_color in scatter_layers:
    gmap.scatter(layer_lats, layer_lons, layer_color, size=10, marker=False)

# The two disabled experiments that used to sit here as bare string literals
# (a polygon overlay and a hand-picked point) were removed; the polygon one
# read 'atm_address_lat'/'atm_address_lon' from df_coords_centers, which has
# 'lat'/'lon' columns instead.

gmap.draw('map.html')

In [22]:
%%HTML
<iframe width="100%" height="450" src="map.html?inline=true"></iframe>