In [1]:
import re
import json
import codecs

import time

from datetime import datetime
from collections import Counter, OrderedDict
from operator import itemgetter

import numpy as np
import pandas as pd

import gmplot
import geopy.distance

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df_mcc = pd.read_csv('data/mcc_codes_ru.csv', sep=',', encoding='utf-8')
df_mcc = df_mcc.set_index('MCC')
df_mcc.head()

Unnamed: 0_level_0,Название,Группа,Обновлено
MCC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
742,Ветеринарные услуги,Контрактные услуги,31.01.18
763,Сельскохозяйственные кооперативы,Контрактные услуги,31.01.18
780,Услуги садоводства и ландшафтного дизайна,Контрактные услуги,31.01.18
1520,Генеральные подрядчики – жилое и коммерческое ...,Контрактные услуги,31.01.18
1711,"Генеральные подрядчики по вентиляции, теплосна...",Контрактные услуги,31.01.18


In [3]:
dtypes = {
    'amount': np.float32,
    'atm_address': str,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'city': str,
    'country': str,
    'currency': np.float32,
    'customer_id': str,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'mcc': str,
    'pos_address': str,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'terminal_id': str,
    'transaction_date': str,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32
}

In [4]:
use_columns = dtypes.keys()

df_1 = pd.read_csv("train_set.csv", sep=',', encoding='utf-8', dtype=dtypes, usecols=use_columns)
df_1["is_train"] = True
df_1.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,is_train
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177,True
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177,True
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177,True
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177,True
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177,True


In [5]:
use_columns = filter(lambda k: not (k.startswith('home_') or k.startswith('work_')), dtypes.iterkeys())

df_2 = pd.read_csv("test_set.csv", sep=',', encoding='utf-8', dtype=dtypes, usecols=use_columns)
df_2["mcc"] = df_2["mcc"].map(lambda x: re.sub(',', '', x))
df_2["is_train"] = False
df_2.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,is_train
0,2.211818,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,2-1 TOLMACHEVA STRIVANTEEVKA141280 RUSRUS,55.967487,37.913681,ff0476dae4b098a7b16aabe93d4268df,2017-08-24,False
1,1.331379,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,4111,"SOVETSKIJ, 32IVANTEEVKA141282 RUSRUS",55.971294,37.905186,7cfd9a60282459d4692ecc85b856072e,2017-08-12,False
2,2.608004,,,,PUSHKINO,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"105,KRASOARMEYSKOE SHPUSHKINO141206 RUSRUS",56.01659,37.9091,7e5a532f0029861d8a9c4f0479b9450b,2017-06-17,False
3,1.916752,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"G. IVANTEEVKA, UL.TOLMACHEVA, D.6IVANTEEVKA141...",55.964508,37.937912,2afe7d1bc61b86c449f413bdf2119032,2017-08-12,False
4,1.981067,,,,MOSCOW,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5814,5 KOMSOMOLSKAYA SQMOSCOW101000 RUSRUS,55.776802,37.657352,ab4f00601ff1d949afc59ee3f804c79c,2017-04-26,False


In [6]:
df = pd.concat([df_1, df_2], axis=0)
df["mcc"] = df["mcc"].astype(np.int32)

In [7]:
# mask = np.logical_and(df["mcc"] != 6010, df["mcc"] != 6011)
mask = True

In [8]:
mask_home = np.logical_and(
    df["home_add_lat"].notnull(),
    df["home_add_lon"].notnull()
)
mask_home = np.logical_and(mask_home, mask)

is_home_stats = \
    (df.loc[mask_home, "pos_address_lat"] - df.loc[mask_home, "home_add_lat"]).pow(2) +\
    (df.loc[mask_home, "pos_address_lon"] - df.loc[mask_home, "home_add_lon"]).pow(2) <= 0.02 ** 2

sum(is_home_stats) / float(len(is_home_stats))

0.18308051322835134

In [9]:
mask_work = np.logical_and(
    df["work_add_lat"].notnull(),
    df["work_add_lon"].notnull()
)
mask_work = np.logical_and(mask_work, mask)

is_work_stats = \
    (df.loc[mask_work, "pos_address_lat"] - df.loc[mask_work, "home_add_lat"]).pow(2) +\
    (df.loc[mask_work, "pos_address_lon"] - df.loc[mask_work, "home_add_lon"]).pow(2) <= 0.02 ** 2

sum(is_work_stats) / float(len(is_work_stats))

0.19405259832094604

In [10]:
codes = []

for code, group in df.groupby("mcc"):
    if 6010 <= code <= 6011:
        continue

    # Работа
    mask_work = np.logical_and(
        group["work_add_lat"].notnull(),
        group["work_add_lon"].notnull()
    )
    
    is_work = \
        (group.loc[mask_work, "pos_address_lat"] - group.loc[mask_work, "work_add_lat"]).pow(2) +\
        (group.loc[mask_work, "pos_address_lon"] - group.loc[mask_work, "work_add_lon"]).pow(2) <= 0.02 ** 2
        
    p_is_work = sum(is_work) / float(len(is_work)) if len(is_work) > 0 else np.nan
    
    # Дом
    mask_home = np.logical_and(
        group["home_add_lat"].notnull(),
        group["home_add_lon"].notnull()
    )
    
    is_home = \
        (group.loc[mask_home, "pos_address_lat"] - group.loc[mask_home, "home_add_lat"]).pow(2) +\
        (group.loc[mask_home, "pos_address_lon"] - group.loc[mask_home, "home_add_lon"]).pow(2) <= 0.02 ** 2
        
    p_is_home = sum(is_home) / float(len(is_home)) if len(is_home) > 0 else np.nan

    codes.append((code, p_is_work, p_is_home, group.shape[0]))
    
codes = sorted(codes, key=lambda x: (x[-1], -x[0]), reverse=True)

In [11]:
codes_suspicious = OrderedDict()

for record in codes:
    print "{}\tis_work={:.3f}\tis_home={:.3f}\ttotal={}".format(*record)
    codes_susp_info = {
        'p_work': record[1] > 0.2  and record[3] > 80,
        'n_work': record[1] < 0.03 and record[3] > 80,
        'p_home': record[2] > 0.2  and record[3] > 80,
        'n_home': record[2] < 0.03 and record[3] > 80
    }
    codes_suspicious[record[0]] = codes_susp_info

5411	is_work=0.136	is_home=0.324	total=785960
5814	is_work=0.255	is_home=0.089	total=256733
5812	is_work=0.342	is_home=0.078	total=126508
5541	is_work=0.063	is_home=0.128	total=92953
5499	is_work=0.179	is_home=0.369	total=92242
5912	is_work=0.148	is_home=0.339	total=83924
4111	is_work=0.084	is_home=0.108	total=52044
5921	is_work=0.109	is_home=0.414	total=38009
5331	is_work=0.135	is_home=0.325	total=25499
5261	is_work=0.088	is_home=0.085	total=24591
5691	is_work=0.092	is_home=0.110	total=22674
5977	is_work=0.129	is_home=0.211	total=20082
5999	is_work=0.151	is_home=0.225	total=19254
5311	is_work=0.188	is_home=0.403	total=17684
5211	is_work=0.093	is_home=0.175	total=14030
5641	is_work=0.083	is_home=0.245	total=11923
8099	is_work=0.090	is_home=0.161	total=11004
5651	is_work=0.092	is_home=0.136	total=10831
5995	is_work=0.072	is_home=0.332	total=10679
5661	is_work=0.098	is_home=0.146	total=9262
5945	is_work=0.102	is_home=0.235	total=8701
7230	is_work=0.122	is_home=0.217	total=8582
5533	is_wo

In [12]:
codes_suspicious_i = [code for code, info in codes_suspicious.iteritems() if sum(info.values()) > 0]
df_mcc.loc[codes_suspicious_i]

Unnamed: 0_level_0,Название,Группа,Обновлено
MCC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5411,"Бакалейные магазины, супермаркеты",Розничные магазины,01.01.18
5814,Фастфуд,Различные магазины,30.12.17
5812,"Места общественного питания, рестораны",Различные магазины,01.01.18
5499,Различные продовольственные магазины - нигде б...,Розничные магазины,01.01.18
5912,Аптеки,Различные магазины,01.01.18
5921,Магазины с продажей спиртных напитков навынос,Различные магазины,01.01.18
5331,Универсальные магазины,Розничные магазины,01.01.18
5977,Магазины косметики,Различные магазины,01.01.18
5999,Различные магазины и специальные розничные маг...,Различные магазины,01.01.18
5311,Универмаги,Розничные магазины,18.04.17


In [13]:
df_mcc_features = pd.DataFrame(codes_suspicious).transpose()
df_mcc_features = df_mcc_features.join(df_mcc[u"Группа"])
df_mcc_features.rename(columns={u"Группа": 'mcc_group'}, inplace=True)
mcc_groups = sorted(df_mcc_features['mcc_group'].unique())
df_mcc_features['mcc_group'] = map(mcc_groups.index, df_mcc_features['mcc_group'])
df_mcc_features.sort_index(inplace=True)
df_mcc_features.head()

Unnamed: 0,n_home,n_work,p_home,p_work,mcc_group
50,False,False,False,False,0
146,False,False,False,False,0
160,False,False,False,False,0
165,False,False,False,False,0
168,False,False,False,False,0


In [14]:
df_mcc.shape[0], df_mcc_features.shape[0]

(1019, 302)

In [15]:
df_mcc_features.to_csv('data/mcc_features.csv', sep=',', encoding='utf-8', index=True, index_label='mcc')