In [1]:
import re
import json
import codecs

from datetime import datetime
from collections import Counter
from operator import itemgetter

import numpy as np
import pandas as pd

import gmplot
import geopy.distance

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
dtypes = {
    'amount': np.float32,
    'atm_address': str,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'city': str,
    'country': str,
    'currency': np.float32,
    'customer_id': str,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'mcc': str,
    'pos_address': str,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'terminal_id': str,
    'transaction_date': str,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32
}

In [2]:
use_columns = dtypes.keys()

df_1 = pd.read_csv("train_set.csv", sep=',', encoding='utf-8', dtype=dtypes, usecols=use_columns)
df_1["is_train"] = True
df_1.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,is_train
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177,True
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177,True
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177,True
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177,True
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177,True


In [3]:
use_columns = filter(lambda k: not (k.startswith('home_') or k.startswith('work_')), dtypes.iterkeys())

df_2 = pd.read_csv("test_set.csv", sep=',', encoding='utf-8', dtype=dtypes, usecols=use_columns)
df_2["mcc"] = df_2["mcc"].map(lambda x: re.sub(',', '', x))
df_2["is_train"] = False
df_2.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,is_train
0,2.211818,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,2-1 TOLMACHEVA STRIVANTEEVKA141280 RUSRUS,55.967487,37.913681,ff0476dae4b098a7b16aabe93d4268df,2017-08-24,False
1,1.331379,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,4111,"SOVETSKIJ, 32IVANTEEVKA141282 RUSRUS",55.971294,37.905186,7cfd9a60282459d4692ecc85b856072e,2017-08-12,False
2,2.608004,,,,PUSHKINO,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"105,KRASOARMEYSKOE SHPUSHKINO141206 RUSRUS",56.01659,37.9091,7e5a532f0029861d8a9c4f0479b9450b,2017-06-17,False
3,1.916752,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"G. IVANTEEVKA, UL.TOLMACHEVA, D.6IVANTEEVKA141...",55.964508,37.937912,2afe7d1bc61b86c449f413bdf2119032,2017-08-12,False
4,1.981067,,,,MOSCOW,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5814,5 KOMSOMOLSKAYA SQMOSCOW101000 RUSRUS,55.776802,37.657352,ab4f00601ff1d949afc59ee3f804c79c,2017-04-26,False


In [4]:
df = pd.concat([df_1, df_2], axis=0)
df["mcc"] = df["mcc"].astype(np.int32)

In [5]:
mask = df[['pos_address_lat', 'pos_address_lon']].notnull().all(axis=1)
df_coords = df.loc[mask, ["terminal_id", "mcc", 'pos_address_lat', 'pos_address_lon']].\
    groupby("terminal_id").median()
df_coords["mcc"] = df_coords["mcc"].astype(np.int32)
df_coords.sort_values(by=['pos_address_lat', 'pos_address_lon'], inplace=True)
df_coords.rename(columns={'pos_address_lat':'lat', 'pos_address_lon':'lon'}, inplace=True)
df_coords.head()

Unnamed: 0_level_0,mcc,lat,lon
terminal_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
35cbf8977b0279ea11c9f0c2bb4023f8,5331,42.102032,48.275082
4ed14e29988ba900f4606b7672ee78d3,5499,42.102032,48.275082
6ce9b191529e47eb9ac10e87e2c14033,5814,42.102032,48.275082
7048cc76efcc83cd740fa61ee623abd3,4722,42.102032,48.275082
a163fab651d6ce0c9bfa7b3e305ed731,5814,42.102032,48.275082


In [6]:
coords = df_coords[['lat', 'lon']].values

In [7]:
def get_banks_area(coords, lat, lon, s=0.01):
    lat_l = np.searchsorted(coords[:, 0], lat - s, side='left')
    lat_r = np.searchsorted(coords[:, 0], lat + s, side='right')
    
    index = []
    for lon_l, lon_curr in enumerate(coords[lat_l:lat_r, 1]):
        if lon - s <= lon_curr <= lon + s:
            index.append(lat_l + lon_l)
    
    return index

def filter_results(df_results, lat, lon):
    df_results["dist"] = df_results.apply(
        lambda x: geopy.distance.vincenty((x["lat"], x["lon"]), (lat, lon)).km,
        axis=1
    )
    df_results = df_results.loc[df_results["dist"] <= 1.0]
    df_results.sort_values(by="dist", inplace=True)
    return df_results

df_pos_info = []

for i, term_id in enumerate(df_coords.index):
    lat, lon, mcc = df_coords.loc[term_id, ["lat", "lon", "mcc"]]
    index = get_banks_area(coords, lat, lon)
    df_results = df_coords.iloc[index]
    df_results = filter_results(df_results, lat, lon)
    pos_info = {
        'terminal_id': term_id,
        'lat': lat,
        'lon': lon,
        'mcc': mcc,
        'n_points': df_results.shape[0],
        'n_points_same': sum(df_results["mcc"] == mcc)
    }
    df_pos_info.append(pos_info)
    
    if (i + 1) % 1000 == 0:
        print u'{} of {} processed...'.format(i + 1, df_coords.shape[0])
    
df_pos_info = pd.DataFrame(df_pos_info)
df_pos_info.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1000 of 255995 processed...
2000 of 255995 processed...
3000 of 255995 processed...
4000 of 255995 processed...
5000 of 255995 processed...
6000 of 255995 processed...
7000 of 255995 processed...
8000 of 255995 processed...
9000 of 255995 processed...
10000 of 255995 processed...
11000 of 255995 processed...
12000 of 255995 processed...
13000 of 255995 processed...
14000 of 255995 processed...
15000 of 255995 processed...
16000 of 255995 processed...
17000 of 255995 processed...
18000 of 255995 processed...
19000 of 255995 processed...
20000 of 255995 processed...
21000 of 255995 processed...
22000 of 255995 processed...
23000 of 255995 processed...
24000 of 255995 processed...
25000 of 255995 processed...
26000 of 255995 processed...
27000 of 255995 processed...
28000 of 255995 processed...
29000 of 255995 processed...
30000 of 255995 processed...
31000 of 255995 processed...
32000 of 255995 processed...
33000 of 255995 processed...
34000 of 255995 processed...
35000 of 255995 process

Unnamed: 0,lat,lon,mcc,n_points,n_points_same,terminal_id
0,42.102032,48.275082,5331.0,9,3,35cbf8977b0279ea11c9f0c2bb4023f8
1,42.102032,48.275082,5499.0,9,1,4ed14e29988ba900f4606b7672ee78d3
2,42.102032,48.275082,5814.0,9,4,6ce9b191529e47eb9ac10e87e2c14033
3,42.102032,48.275082,4722.0,9,1,7048cc76efcc83cd740fa61ee623abd3
4,42.102032,48.275082,5814.0,9,4,a163fab651d6ce0c9bfa7b3e305ed731


In [8]:
df_pos_info.to_csv('data/pos_features.csv', sep=',', index=False, encoding='utf-8')