In [1]:
import re
import json
import codecs

import time

from datetime import datetime
from collections import Counter
from operator import itemgetter

import numpy as np
import pandas as pd

import gmplot
import geopy.distance

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Column -> dtype mapping handed to pd.read_csv for both data files.
# Numeric columns (amount, currency code and every latitude/longitude)
# are read as 32-bit floats; everything else is kept as text.
dtypes = dict.fromkeys(
    (
        'amount',
        'currency',
        'atm_address_lat', 'atm_address_lon',
        'pos_address_lat', 'pos_address_lon',
        'home_add_lat', 'home_add_lon',
        'work_add_lat', 'work_add_lon',
    ),
    np.float32,
)
dtypes.update(dict.fromkeys(
    (
        'atm_address',
        'pos_address',
        'city',
        'country',
        'customer_id',
        'terminal_id',
        'mcc',
        'transaction_date',
    ),
    str,
))

In [3]:
# Restrict the read to the columns that have a declared dtype.
use_columns = list(dtypes)

df_1 = pd.read_csv(
    "train_set.csv",
    sep=',',
    encoding='utf-8',
    dtype=dtypes,
    usecols=use_columns
).assign(is_train=True)  # flag used to tell train rows from test rows later
df_1.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,is_train
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177,True
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177,True
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177,True
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177,True
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177,True


In [4]:
# The home_*/work_* coordinate columns are not requested from the test file
# (presumably they are absent there -- confirm against test_set.csv).
# A list comprehension is used instead of filter()/iterkeys() so the cell
# runs on both Python 2 and Python 3 and always yields a real list.
use_columns = [
    col for col in dtypes
    if not col.startswith(('home_', 'work_'))
]

df_2 = pd.read_csv("test_set.csv", sep=',', encoding='utf-8', dtype=dtypes, usecols=use_columns)
# Strip commas from the MCC codes. The vectorised .str accessor leaves a
# missing value as NaN, whereas calling re.sub on it raises TypeError.
df_2["mcc"] = df_2["mcc"].str.replace(',', '')
df_2["is_train"] = False
df_2.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,is_train
0,2.211818,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,2-1 TOLMACHEVA STRIVANTEEVKA141280 RUSRUS,55.967487,37.913681,ff0476dae4b098a7b16aabe93d4268df,2017-08-24,False
1,1.331379,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,4111,"SOVETSKIJ, 32IVANTEEVKA141282 RUSRUS",55.971294,37.905186,7cfd9a60282459d4692ecc85b856072e,2017-08-12,False
2,2.608004,,,,PUSHKINO,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"105,KRASOARMEYSKOE SHPUSHKINO141206 RUSRUS",56.01659,37.9091,7e5a532f0029861d8a9c4f0479b9450b,2017-06-17,False
3,1.916752,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"G. IVANTEEVKA, UL.TOLMACHEVA, D.6IVANTEEVKA141...",55.964508,37.937912,2afe7d1bc61b86c449f413bdf2119032,2017-08-12,False
4,1.981067,,,,MOSCOW,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5814,5 KOMSOMOLSKAYA SQMOSCOW101000 RUSRUS,55.776802,37.657352,ab4f00601ff1d949afc59ee3f804c79c,2017-04-26,False


In [5]:
# Stack train and test rows into one frame. Columns that were not read from
# the test file (home_*/work_*) are NaN for the test rows. Each input keeps
# its own row labels, so the resulting index contains duplicate values.
df = pd.concat(objs=[df_1, df_2])

In [6]:
# Based on: http://calendar.yoip.ru/holiday/2017-calendar-prazdnikov.html
#
# Maps a 'YYYY-MM-DD' date to the set of tags that apply to it:
#   'holiday' - a celebrated date (official or commercial),
#   'dayoff'  - a non-working day,
#   'short'   - a shortened working day.
holidays = {
    '2017-01-01': {'holiday', 'dayoff'},
    '2017-01-02': {'dayoff'},
    '2017-01-03': {'dayoff'},
    '2017-01-04': {'dayoff'},
    '2017-01-05': {'dayoff'},
    '2017-01-06': {'dayoff'},
    '2017-01-07': {'holiday', 'dayoff'},
    '2017-02-14': {'holiday'}, # St. Valentine's Day
    '2017-02-22': {'short'},
    '2017-02-23': {'holiday', 'dayoff'},
    '2017-02-24': {'dayoff'},
    '2017-03-07': {'short'},
    '2017-03-08': {'holiday', 'dayoff'},
    '2017-05-01': {'holiday', 'dayoff'},
    '2017-05-08': {'dayoff'},
    '2017-05-09': {'holiday', 'dayoff'},
    '2017-06-12': {'holiday', 'dayoff'}, # holiday ?
    '2017-09-01': {'holiday'},  # Knowledge Day
    '2017-10-05': {'holiday'},  # Teachers' Day
    '2017-11-03': {'short'},
    '2017-11-04': {'holiday', 'dayoff'}, # holiday ?
    '2017-11-06': {'dayoff'},
    '2017-11-24': {'holiday'},  # Black Friday
    '2017-11-27': {'holiday'},  # Cyber Monday
    '2017-12-31': {'holiday', 'dayoff'}
}

# Chronologically sorted datetimes of every date tagged 'holiday'.
# Despite the name this is a list, not a set. dict.items() is used instead
# of the Python-2-only iteritems() so the cell runs on Python 2 and 3.
holidays_set = sorted(
    datetime.strptime(day, '%Y-%m-%d')
    for day, tags in holidays.items()
    if 'holiday' in tags
)

In [7]:
def get_features_from_date(date_str, calendar=None):
    """Build calendar features for one ``YYYY-MM-DD`` date string.

    Parameters
    ----------
    date_str : str
        Date formatted as ``%Y-%m-%d``.
    calendar : dict, optional
        Mapping of date string -> collection of tags ('holiday', 'dayoff',
        'short'). When omitted, the module-level ``holidays`` mapping and
        the pre-sorted ``holidays_set`` list are used.

    Returns
    -------
    dict
        Keys: 'transaction_date', 'is_weekend', 'day_of_week' (ISO, 1 = Monday),
        'week_of_year' (ISO week), 'day', 'month', 'timestamp',
        'is_holiday', 'is_dayoff', 'is_short', 'days_before_holiday'
        (days until the nearest holiday on or after the date) and
        'days_after_holiday' (days since the nearest holiday on or before
        the date). A distance is NaN when the calendar holds no holiday on
        that side of the date.

    Raises
    ------
    ValueError
        If ``date_str`` does not match ``%Y-%m-%d``.

    Notes
    -----
    'timestamp' comes from ``time.mktime``, which interprets the date as
    midnight in the machine's local time zone, so its value differs between
    machines configured with different time zones.
    """
    date_dt = datetime.strptime(date_str, '%Y-%m-%d')

    if calendar is None:
        # Default behaviour: rely on the notebook-level calendar objects.
        calendar = holidays
        holiday_dates = holidays_set
    else:
        holiday_dates = [
            datetime.strptime(day, '%Y-%m-%d')
            for day, day_tags in calendar.items()
            if 'holiday' in day_tags
        ]

    day_of_week = date_dt.isoweekday()

    date_features = {
        'transaction_date': date_str,
        'is_weekend': day_of_week > 5,
        'day_of_week': day_of_week,
        'week_of_year': date_dt.isocalendar()[1],
        'day': date_dt.day,
        'month': date_dt.month,
        'timestamp': time.mktime(date_dt.timetuple())
    }

    # Dates missing from the calendar carry no tags at all.
    tags = calendar.get(date_str, ())
    date_features['is_holiday'] = 'holiday' in tags
    date_features['is_dayoff'] = 'dayoff' in tags
    date_features['is_short'] = 'short' in tags

    # Signed offsets in days: negative -> holiday still ahead, positive ->
    # holiday already passed, zero -> the date itself is a holiday.
    offsets = [(date_dt - holiday_dt).days for holiday_dt in holiday_dates]
    days_until = [-offset for offset in offsets if offset <= 0]
    days_since = [offset for offset in offsets if offset >= 0]

    # Plain lists (not filter objects) keep this working on Python 3, and
    # the emptiness guards avoid a ValueError for dates outside the calendar.
    date_features['days_before_holiday'] = min(days_until) if days_until else np.nan
    date_features['days_after_holiday'] = min(days_since) if days_since else np.nan

    return date_features

# Features are computed once per distinct transaction date, not once per
# transaction row.
date_uniq = sorted(df["transaction_date"].dropna().unique())

# Build the records as a concrete list: on Python 3 map() returns a lazy
# iterator, which older pandas versions refuse as DataFrame input.
df_dates = pd.DataFrame([get_features_from_date(date_str) for date_str in date_uniq])
# Fixed-width integer so the column dtype does not depend on the platform's
# default int size.
df_dates["timestamp"] = df_dates["timestamp"].astype(np.int64)
df_dates.head()

Unnamed: 0,day,day_of_week,days_after_holiday,days_before_holiday,is_dayoff,is_holiday,is_short,is_weekend,month,timestamp,transaction_date,week_of_year
0,27,5,20,18,False,False,False,False,1,1485464400,2017-01-27,4
1,28,6,21,17,False,False,False,True,1,1485550800,2017-01-28,4
2,29,7,22,16,False,False,False,True,1,1485637200,2017-01-29,4
3,30,1,23,15,False,False,False,False,1,1485723600,2017-01-30,5
4,31,2,24,14,False,False,False,False,1,1485810000,2017-01-31,5


In [8]:
# Spot-check ten consecutive rows by position (in the recorded run they span
# 16-25 February, i.e. the days around the 23 February holiday).
df_dates.iloc[20:30, :]

Unnamed: 0,day,day_of_week,days_after_holiday,days_before_holiday,is_dayoff,is_holiday,is_short,is_weekend,month,timestamp,transaction_date,week_of_year
20,16,4,2,7,False,False,False,False,2,1487192400,2017-02-16,7
21,17,5,3,6,False,False,False,False,2,1487278800,2017-02-17,7
22,18,6,4,5,False,False,False,True,2,1487365200,2017-02-18,7
23,19,7,5,4,False,False,False,True,2,1487451600,2017-02-19,7
24,20,1,6,3,False,False,False,False,2,1487538000,2017-02-20,8
25,21,2,7,2,False,False,False,False,2,1487624400,2017-02-21,8
26,22,3,8,1,False,False,True,False,2,1487710800,2017-02-22,8
27,23,4,0,0,True,True,False,False,2,1487797200,2017-02-23,8
28,24,5,1,12,True,False,False,False,2,1487883600,2017-02-24,8
29,25,6,2,11,False,False,False,True,2,1487970000,2017-02-25,8


In [9]:
# Persist the per-date feature table without the row index.
df_dates.to_csv(
    path_or_buf='data/date_features.csv',
    index=False,
    encoding='utf-8',
    sep=','
)