In [1]:
import re
import json
import codecs

import time

from datetime import datetime
from collections import Counter, OrderedDict
from operator import itemgetter

import numpy as np
import pandas as pd

import gmplot
import geopy.distance

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the per-MCC feature table and index it by MCC code. drop=False keeps
# `mcc` available as an ordinary column in addition to being the index.
df_mcc = (
    pd.read_csv('data/mcc_features.csv', sep=',', encoding='utf-8')
    .set_index('mcc', drop=False)
)
df_mcc.head()

Unnamed: 0_level_0,mcc,n_home,n_work,p_home,p_work,mcc_group
mcc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
50,50,False,False,False,False,0
146,146,False,False,False,False,0
160,160,False,False,False,False,0
165,165,False,False,False,False,0
168,168,False,False,False,False,0


In [3]:
# Column name -> dtype mapping handed to pd.read_csv for the transaction files.
# Numeric columns are parsed as 32-bit floats; every other column is kept as text.
dtypes = {
    column: np.float32
    for column in (
        'amount',
        'atm_address_lat', 'atm_address_lon',
        'currency',
        'home_add_lat', 'home_add_lon',
        'pos_address_lat', 'pos_address_lon',
        'work_add_lat', 'work_add_lon',
    )
}
dtypes.update({
    column: str
    for column in (
        'atm_address', 'city', 'country', 'customer_id', 'mcc',
        'pos_address', 'terminal_id', 'transaction_date',
    )
})

In [4]:
# Training rows: read exactly the columns that have an explicit dtype and tag
# every row with is_train=True.
use_columns = dtypes.keys()

df_1 = pd.read_csv(
    "train_set.csv",
    sep=',',
    encoding='utf-8',
    dtype=dtypes,
    usecols=use_columns,
).assign(is_train=True)
df_1.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,is_train
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177,True
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177,True
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177,True
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177,True
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177,True


In [5]:
# Test rows: same columns as the training set except the home_*/work_* ones,
# which are not read from test_set.csv.
# Built as a real list by iterating the dict directly, so it works on both
# Python 2 and Python 3 (dict.iterkeys() is Python 2 only, and filter() is a
# lazy iterator on Python 3).
use_columns = [
    column for column in dtypes
    if not column.startswith(('home_', 'work_'))
]

df_2 = pd.read_csv("test_set.csv", sep=',', encoding='utf-8', dtype=dtypes, usecols=use_columns)
# Strip commas from the MCC strings so they can later be cast to integers.
# The vectorised .str.replace leaves missing values as NaN instead of raising
# inside a per-row callback.
df_2["mcc"] = df_2["mcc"].str.replace(',', '')
df_2["is_train"] = False
df_2.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,is_train
0,2.211818,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,2-1 TOLMACHEVA STRIVANTEEVKA141280 RUSRUS,55.967487,37.913681,ff0476dae4b098a7b16aabe93d4268df,2017-08-24,False
1,1.331379,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,4111,"SOVETSKIJ, 32IVANTEEVKA141282 RUSRUS",55.971294,37.905186,7cfd9a60282459d4692ecc85b856072e,2017-08-12,False
2,2.608004,,,,PUSHKINO,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"105,KRASOARMEYSKOE SHPUSHKINO141206 RUSRUS",56.01659,37.9091,7e5a532f0029861d8a9c4f0479b9450b,2017-06-17,False
3,1.916752,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"G. IVANTEEVKA, UL.TOLMACHEVA, D.6IVANTEEVKA141...",55.964508,37.937912,2afe7d1bc61b86c449f413bdf2119032,2017-08-12,False
4,1.981067,,,,MOSCOW,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5814,5 KOMSOMOLSKAYA SQMOSCOW101000 RUSRUS,55.776802,37.657352,ab4f00601ff1d949afc59ee3f804c79c,2017-04-26,False


In [6]:
# Stack the train and test rows into a single frame. ignore_index=True gives
# the result a fresh 0..n-1 index; without it each input keeps its own row
# labels, so labels repeat and label-based selection becomes ambiguous.
df = pd.concat([df_1, df_2], axis=0, ignore_index=True)
# `mcc` is read as text; cast it to integers so it can be matched against the
# integer MCC codes used by the category sets and by df_mcc's index.
df["mcc"] = df["mcc"].astype(np.int32)

In [7]:
# Hand-picked MCC codes used as "has children" / "has pet" indicators.
children_set = set([5641, 7032, 8211, 8220, 8351])

pet_set = set([742, 5995])

# Car-related codes: start from every MCC that shares a group with 5551,
# drop 5551 itself, then add three individually chosen codes.
group_of_5551 = df_mcc.loc[5551, "mcc_group"]
car_set = set(df_mcc.loc[df_mcc["mcc_group"] == group_of_5551, "mcc"])
car_set.remove(5551)
car_set.update([4784, 8675, 7523])

# Also include the codes below 7550 from the group that contains 7531.
group_of_7531 = df_mcc.loc[7531, "mcc_group"]
t = {
    code
    for code in df_mcc.loc[df_mcc["mcc_group"] == group_of_7531, "mcc"]
    if code < 7550
}
car_set |= t

In [8]:
# Show the assembled set of car-related MCC codes for a visual sanity check.
car_set

{4784,
 5511,
 5521,
 5531,
 5532,
 5533,
 5541,
 5542,
 5561,
 5571,
 5598,
 5599,
 7523,
 7531,
 7534,
 7535,
 7538,
 7542,
 7549,
 8675}

In [9]:
# Build one feature dict per customer:
#   * pct_mcc_gNN  - share of the customer's transactions in MCC group NN,
#                    rounded to 3 decimals (only groups the customer used);
#   * has_children / has_car / has_pet - whether any of the customer's MCC
#     codes belongs to the corresponding hand-built code set;
#   * customer_id  - the grouping key.

# MCC code -> MCC group lookup, built once instead of querying df_mcc for
# every (customer, mcc) pair inside the loop.
mcc_to_group = dict(zip(df_mcc.index, df_mcc["mcc_group"]))

customers_info = []

for user, group in df.groupby("customer_id"):
    n_trans = group.shape[0]

    # NOTE(review): count() counts rows with a non-null `amount`, while
    # n_trans counts every row; if `amount` can be missing the shares will
    # not sum to 1 - confirm this is intended.
    mcc_counts = group[["amount", "mcc"]].groupby("mcc").count()
    mcc_unique = set(mcc_counts.index)

    group_counts = {}
    for mcc, count in mcc_counts.itertuples():
        # MCC codes missing from df_mcc are accounted under group 0.
        mcc_g = "pct_mcc_g{:02}".format(mcc_to_group.get(mcc, 0))
        group_counts[mcc_g] = group_counts.get(mcc_g, 0) + count

    # Convert per-group counts into shares of the customer's transactions.
    # A new dict is built here, so no Python 2-only dict.iterkeys() is needed.
    user_info = {
        mcc_g: round(count / float(n_trans), 3)
        for mcc_g, count in group_counts.items()
    }

    user_info.update({
        'has_children': len(mcc_unique & children_set) > 0,
        'has_car': len(mcc_unique & car_set) > 0,
        'has_pet': len(mcc_unique & pet_set) > 0,
        'customer_id': user
    })

    customers_info.append(user_info)

In [10]:
# One row per customer. A customer's dict only holds the MCC groups that
# customer actually used, so the remaining pct_mcc_g* cells come out as NaN.
df_customers_info = pd.DataFrame(customers_info)
# Materialise the column names as a list: they are used twice below, and on
# Python 3 a filter() object is a one-shot iterator rather than a list.
mcc_g_columns = [
    column for column in df_customers_info.columns
    if column.startswith('pct_mcc_g')
]
# A missing group means the customer made no transactions in it, i.e. share 0.
df_customers_info[mcc_g_columns] = df_customers_info[mcc_g_columns].fillna(0)
df_customers_info.head()

Unnamed: 0,customer_id,has_car,has_children,has_pet,pct_mcc_g00,pct_mcc_g01,pct_mcc_g02,pct_mcc_g03,pct_mcc_g04,pct_mcc_g05,...,pct_mcc_g11,pct_mcc_g12,pct_mcc_g13,pct_mcc_g14,pct_mcc_g15,pct_mcc_g16,pct_mcc_g17,pct_mcc_g18,pct_mcc_g19,pct_mcc_g20
0,0001f322716470bf9bfc1708f06f00fc,True,False,False,0.68,0.0,0.07,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.19,0.0,0.0
1,00021683ccb416637fe9a4cd35e4606e,False,True,True,0.064,0.0,0.0,0.0,0.0,0.013,...,0.0,0.013,0.0,0.026,0.0,0.359,0.0,0.436,0.0,0.0
2,0002d0f8a642272b41c292c12ab6e602,False,False,False,0.143,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.195,0.0,0.662,0.0,0.0
3,0004d182d9fede3ba2534b2d5e5ad27e,False,False,False,0.766,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.019,0.0,0.038,0.0,0.133,0.006,0.0
4,0007297d86e14bd68bd87b1dbdefe302,True,False,False,0.279,0.0,0.004,0.0,0.008,0.0,...,0.0,0.0,0.0,0.008,0.008,0.377,0.0,0.275,0.012,0.0


In [11]:
# Persist the per-customer feature table as CSV, omitting the row index.
df_customers_info.to_csv(
    "data/customers_features.csv",
    index=False,
    encoding='utf-8',
    sep=',',
)