In [1]:
import codecs

from operator import itemgetter

import numpy as np
import pandas as pd

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib

In [2]:
def is_inside_area(lat_pred, lon_pred, lat_true, lon_true, radius=0.02):
    return np.power(lat_pred - lat_true, 2) + np.power(lon_pred - lon_true, 2) <= radius ** 2

In [3]:
time_features = {u'day', u'day_of_week', u'days_after_holiday',
       u'days_before_holiday', u'is_dayoff', u'is_holiday', u'is_short',
       u'is_weekend', u'month', u'timestamp', u'week_of_year'}

### Подготавливаем данные

In [3]:
df = pd.read_csv("data/sets/train_set_final.csv", sep=',', encoding='utf-8')
df.drop(labels=time_features, axis=1, inplace=True)
df.drop_duplicates(inplace=True)
print df.shape
df.head()

(1131820, 36)


Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,terminal_id,work_add_lat,...,has_children,has_pet,amount_mean,amount_sub,amount_div,pct_term_counts,pct_mcc_g_counts,is_abroad,amount_true,is_native
0,2.884034,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,11606fde0c814ce78e0d726e39a0a5ee,59.847,...,0,0,3.552742,-0.668708,0.811777,0.013889,0.138889,0,1914.141006,1
1,2.775633,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,e9647a5e1eacfb06713b6af755ccc595,59.847,...,0,0,3.552742,-0.777109,0.781265,0.013889,0.138889,0,1491.327772,1
2,3.708368,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,df06c1fcd3718a514535ae822785f716,59.847,...,0,0,3.552742,0.155626,1.043805,0.013889,0.013889,0,12773.452874,1
4,2.787498,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,6c5e5793ebc984fb72875feffff62854,59.847,...,0,0,3.552742,-0.765244,0.784605,0.013889,0.138889,0,1532.631571,1
6,2.89251,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,0576445d74e374c92c0902e612fca356,59.847,...,0,0,3.552742,-0.660232,0.814163,0.041667,0.138889,0,1951.865761,1


In [4]:
df.to_csv("data/sets/tmp/train_set_final.csv", sep=',', encoding='utf-8', index=False)

In [5]:
np.random.seed(8888)

In [7]:
df = pd.read_csv("data/sets/tmp/train_set_final.csv", sep=',', encoding='utf-8', usecols=['customer_id'])
df.head()

Unnamed: 0,customer_id
0,0dc0137d280a2a82d2dc89282450ff1b
1,0dc0137d280a2a82d2dc89282450ff1b
2,0dc0137d280a2a82d2dc89282450ff1b
3,0dc0137d280a2a82d2dc89282450ff1b
4,0dc0137d280a2a82d2dc89282450ff1b


In [8]:
customers = df['customer_id'].unique()
np.random.shuffle(customers)
    
border = int(len(customers) * 0.9)
cust_train, cust_valid = set(customers[:border]), set(customers[border:])

In [9]:
variants_mods = [(i,j) for i in range(-2, 3) for j in range(-2, 3)]
variants_mods.remove((0,0))

with codecs.open('data/sets/tmp/train_set_final.csv', mode='r', encoding='utf-8') as f_input,\
        codecs.open('data/sets/tmp/valid_work.libsvm', mode='w', encoding='utf-8') as f_valid,\
        codecs.open('data/sets/tmp/train_work.libsvm', mode='w', encoding='utf-8') as f_train,\
        codecs.open('data/sets/tmp/valid_work.labels', mode='w', encoding='utf-8') as f_valid_,\
        codecs.open('data/sets/tmp/train_work.labels', mode='w', encoding='utf-8') as f_train_:
    f_train_.write('customer_id,work_add_lat,work_add_lon,lat,lon\n')
    f_valid_.write('customer_id,work_add_lat,work_add_lon,lat,lon\n')
    
    # получаем индексы для каждой из колонок
    header = next(f_input).strip().split(',')
    header_cols = { col: col_i for col_i, col in enumerate(header) }
    
    # настраиваем список фичей
    features = [f for f in header
        if f not in {'home_add_lon', 'home_add_lat',
              'work_add_lon', 'work_add_lat',
              'is_work', 'is_home',
              'customer_id', 'terminal_id'} and f not in time_features]
    n_features = len(features)
    
    def format_line():
        is_coord = int(is_inside_area(lat, lon, lat_coord, lon_coord))
        output_line = [
            str(is_coord),
            features_line,
            '{}:{}'.format(n_features, lat),
            '{}:{}'.format(n_features + 1, lon)
        ]
        output_line = ' '.join(output_line) + '\n'
        return output_line
    
    for line in f_input:        
        line = line.strip()
        line_cols = line.split(',')
        
        lat_coord, lon_coord = \
            line_cols[header_cols['work_add_lat']], line_cols[header_cols['work_add_lon']]
            
        if lat_coord == '' or lon_coord == '':
            continue
            
        lat_coord, lon_coord = map(float, [lat_coord, lon_coord])
        
        cust_id = line_cols[header_cols['customer_id']]
        
        f_output_ = f_train_ if cust_id in cust_train else f_valid_
        
        f_output = f_train if cust_id in cust_train else f_valid
        
        features_line = [str(f_i)+':'+line_cols[header_cols[f]] for f_i, f in enumerate(features)]
        features_line = ' '.join(features_line)
        
        labels_line = ','.join([cust_id, str(lat_coord), str(lon_coord)])
        
        lat, lon = line_cols[header_cols['lat']], line_cols[header_cols['lon']]
        if lat != '' and lon != '':
            lat, lon = map(float, [lat, lon])
            f_output.write(format_line())
            f_output_.write(labels_line+','+str(lat)+','+str(lon)+'\n')
        
            np.random.shuffle(variants_mods)
            for mul_lat, mul_lon in variants_mods[:4]:
                lat_ = lat + mul_lat * 0.02
                lon_ = lon + mul_lon * 0.02
                f_output.write(format_line())
                f_output_.write(labels_line+','+str(lat_)+','+str(lon_)+'\n')
        
        lat, lon = lat_coord, lon_coord
        lat, lon = map(float, [lat, lon])
        f_output.write(format_line())
        f_output_.write(labels_line+','+str(lat)+','+str(lon)+'\n')
        
        np.random.shuffle(variants_mods)
        for mul_lat, mul_lon in variants_mods[:4]:
            lat_ = lat + mul_lat * 0.02
            lon_ = lon + mul_lon * 0.02
            f_output.write(format_line())
            f_output_.write(labels_line+','+str(lat_)+','+str(lon_)+'\n')

In [None]:
def choose_random_lines(libsvm, labels, frac=0.8):
    with codecs.open(libsvm, mode='r') as f_libsvm,\
            codecs.open(labels, mode='r') as f_labels,\
            codecs.open(libsvm+'_random', mode='w') as f_libsvm_r,\
            codecs.open(labels+'_random', mode='w') as f_labels_r:
        try:
            while True:
                line_1, line_2 = next(f_libsvm), next(f_labels)
                if np.random.random() < frac:
                    f_libsvm_r.write(line_1)
                    f_labels_r.write(line_2)
        except StopIteration:
            pass

In [None]:
choose_random_lines('data/sets/tmp/train_work.libsvm', 'data/sets/tmp/train_work.labels', frac=0.5)

In [None]:
choose_random_lines('data/sets/tmp/valid_work.libsvm', 'data/sets/tmp/valid_work.labels', frac=0.5)

In [None]:
with codecs.open('data/sets/train_set_final.csv', mode='r', encoding='utf-8') as f_input:
    header = next(f_input).strip().split(',')
    header_cols = { col: col_i for col_i, col in enumerate(header) }
    
    # настраиваем список фичей
    features = [f for f in header
        if f not in {'home_add_lon', 'home_add_lat',
              'work_add_lon', 'work_add_lat',
              'is_work', 'is_home',
              'customer_id', 'terminal_id'}]
    n_features = len(features)

### Обучаем модель

In [4]:
train_set = xgb.DMatrix('data/sets/tmp/train_work.libsvm')

In [5]:
valid_set = xgb.DMatrix('data/sets/tmp/valid_work.libsvm')

In [6]:
clf = xgb.XGBClassifier(n_estimators=10, n_jobs=3, max_depth=7, objective='binary:logistic')

model = xgb.train(
    params=clf.get_xgb_params(),
    dtrain=train_set,
    evals=[(train_set, 'train'), (valid_set, 'valid')],
    verbose_eval=1,
    num_boost_round=300,
    early_stopping_rounds=10
)

[0]	train-error:0.321076	valid-error:0.322691
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 10 rounds.
[1]	train-error:0.31477	valid-error:0.318102
[2]	train-error:0.303517	valid-error:0.306104
[3]	train-error:0.271275	valid-error:0.273878
[4]	train-error:0.277378	valid-error:0.284753
[5]	train-error:0.278018	valid-error:0.286441
[6]	train-error:0.251849	valid-error:0.263419
[7]	train-error:0.248241	valid-error:0.261706
[8]	train-error:0.247378	valid-error:0.258838
[9]	train-error:0.244337	valid-error:0.259154
[10]	train-error:0.242197	valid-error:0.25848
[11]	train-error:0.240294	valid-error:0.253467
[12]	train-error:0.238791	valid-error:0.252885
[13]	train-error:0.237559	valid-error:0.249701
[14]	train-error:0.237109	valid-error:0.249534
[15]	train-error:0.233821	valid-error:0.246084
[16]	train-error:0.230929	valid-error:0.244072
[17]	train-error:0.229681	valid-error:0.241794
[18]	train-error:0.

[171]	train-error:0.119589	valid-error:0.183894
[172]	train-error:0.119566	valid-error:0.184251
[173]	train-error:0.119567	valid-error:0.184359
[174]	train-error:0.119128	valid-error:0.184891
[175]	train-error:0.118544	valid-error:0.184559
[176]	train-error:0.118121	valid-error:0.185074
[177]	train-error:0.117923	valid-error:0.185232
[178]	train-error:0.117643	valid-error:0.185257
[179]	train-error:0.11756	valid-error:0.185216
[180]	train-error:0.117403	valid-error:0.184925
[181]	train-error:0.11729	valid-error:0.184883
Stopping. Best iteration:
[171]	train-error:0.119589	valid-error:0.183894



In [8]:
joblib.dump(model, 'data/models/clf_work_04.model')

['data/models/clf_work_04.model']

### Оценка модели 

### Train

In [4]:
model = joblib.load('data/models/clf_work_04.model')

In [5]:
train_set = xgb.DMatrix('data/sets/tmp/train_work.libsvm')

In [6]:
y_pred = model.predict(train_set)
y_pred.shape

(5554290,)

In [7]:
df = pd.read_csv('data/sets/tmp/train_work.labels', sep=',', encoding='utf-8')
df.columns = ['customer_id', 'work_lat', 'work_lon', 'lat', 'lon']
print df.shape
df.head()

(5554290, 5)


Unnamed: 0,customer_id,work_lat,work_lon,lat,lon
0,0dc0137d280a2a82d2dc89282450ff1b,59.847,30.177,59.844074,30.179153
1,0dc0137d280a2a82d2dc89282450ff1b,59.847,30.177,59.844074,30.199153
2,0dc0137d280a2a82d2dc89282450ff1b,59.847,30.177,59.824074,30.179153
3,0dc0137d280a2a82d2dc89282450ff1b,59.847,30.177,59.824074,30.199153
4,0dc0137d280a2a82d2dc89282450ff1b,59.847,30.177,59.864074,30.159153


In [8]:
df['is_work'] = y_pred

In [9]:
df_preds = []

for cust_i, group in df.groupby('customer_id'):
    pred_i = group['is_work'].idxmax()
    home_lat, home_lon, lat, lon = group.loc[pred_i, ['work_lat', 'work_lon', 'lat', 'lon']]
    df_preds.append((cust_i, home_lat, home_lon, lat, lon))
    
df_preds = pd.DataFrame(df_preds, columns=['customer_id', 'work_lat', 'work_lon', 'lat', 'lon'])
df_preds.head()

Unnamed: 0,customer_id,work_lat,work_lon,lat,lon
0,0001f322716470bf9bfc1708f06f00fc,44.735001,37.798,44.735001,37.798
1,000b709c6c6fb1e8efcfd95e57c2a9de,54.983002,82.879997,54.983002,82.879997
2,0031915eb230f772681fb5dc5a8d1c31,55.748001,37.601002,55.748001,37.601002
3,003fa58414cc55531fcc38423bea8f8e,55.800999,37.633999,55.800999,37.633999
4,005194bf7238734eb49c142258c5a263,51.68,39.181,51.68,39.181


In [10]:
score = is_inside_area(df_preds['work_lat'], df_preds['work_lon'], df_preds['lat'], df_preds['lon'])
print "Train score: {}".format(score.sum() / float(score.shape[0]))

Train score: 0.987068965517


### Valid

In [15]:
valid_set = xgb.DMatrix('data/sets/tmp/valid_work.libsvm')

In [16]:
y_pred = model.predict(valid_set)
y_pred.shape

(601380,)

In [17]:
df = pd.read_csv('data/sets/tmp/valid_work.labels', sep=',', encoding='utf-8')
df.columns = ['customer_id', 'work_lat', 'work_lon', 'lat', 'lon']
print df.shape
df.head()

(601380, 5)


Unnamed: 0,customer_id,work_lat,work_lon,lat,lon
0,3209e87b652f46439784c4d254dd5d12,55.786999,37.629002,55.624607,37.852997
1,3209e87b652f46439784c4d254dd5d12,55.786999,37.629002,55.604607,37.832997
2,3209e87b652f46439784c4d254dd5d12,55.786999,37.629002,55.644607,37.832997
3,3209e87b652f46439784c4d254dd5d12,55.786999,37.629002,55.664607,37.872997
4,3209e87b652f46439784c4d254dd5d12,55.786999,37.629002,55.624607,37.812997


In [18]:
df['is_work'] = y_pred

In [19]:
df_preds = []

for cust_i, group in df.groupby('customer_id'):
    pred_i = group['is_work'].idxmax()
    home_lat, home_lon, lat, lon = group.loc[pred_i, ['work_lat', 'work_lon', 'lat', 'lon']]
    df_preds.append((cust_i, home_lat, home_lon, lat, lon))
    
df_preds = pd.DataFrame(df_preds, columns=['customer_id', 'work_lat', 'work_lon', 'lat', 'lon'])
df_preds.head()

Unnamed: 0,customer_id,work_lat,work_lon,lat,lon
0,007525af0bc3ce72137b586a298f7b98,55.743999,37.645,55.743999,37.645
1,00eee313c10e5496006273dcedca8633,60.235001,29.622,60.235001,29.622
2,00f6ac562cf73b8d8b8ab14abc1d1a95,55.771999,37.625,55.771999,37.625
3,0169fd91b58dda2f5da41713387d1796,59.801998,30.33,59.801998,30.33
4,021057dfdf04a4b8bab5528e8dc0f814,55.747002,37.534,55.747002,37.534


In [20]:
score = is_inside_area(df_preds['work_lat'], df_preds['work_lon'], df_preds['lat'], df_preds['lon'])
print "Valid score: {}".format(score.sum() / float(score.shape[0]))

Valid score: 0.984555984556


### FAIR SCORE

In [4]:
model = joblib.load('data/models/clf_work_04.model')

In [5]:
df_valid = pd.read_csv('data/sets/tmp/valid_work.labels', sep=',', encoding='utf-8',
                       usecols=['customer_id', 'work_add_lat', 'work_add_lon'])
df_valid.drop_duplicates(inplace=True)
df_valid.head()

Unnamed: 0,customer_id,work_add_lat,work_add_lon
0,3209e87b652f46439784c4d254dd5d12,55.786999,37.629002
640,9c8cf84c4bae8d5341f4fd06c964e8ed,59.942001,30.327
1300,799b45bc4a4c48a17a6e28a878350420,56.007999,92.772003
1740,a89fca446f4c5dcf68820f22d4a5d424,59.945,30.344
2130,b0535ffedfd0679c7edd0ec137929dd5,48.773998,44.772999


In [6]:
df = pd.read_csv("data/sets/tmp/train_set_final.csv", sep=',', encoding='utf-8')
df.head()

Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,terminal_id,work_add_lat,...,has_children,has_pet,amount_mean,amount_sub,amount_div,pct_term_counts,pct_mcc_g_counts,is_abroad,amount_true,is_native
0,2.884034,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,11606fde0c814ce78e0d726e39a0a5ee,59.847,...,0,0,3.552742,-0.668708,0.811777,0.013889,0.138889,0,1914.141006,1
1,2.775633,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,e9647a5e1eacfb06713b6af755ccc595,59.847,...,0,0,3.552742,-0.777109,0.781265,0.013889,0.138889,0,1491.327772,1
2,3.708368,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,df06c1fcd3718a514535ae822785f716,59.847,...,0,0,3.552742,0.155626,1.043805,0.013889,0.013889,0,12773.452874,1
3,2.787498,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,6c5e5793ebc984fb72875feffff62854,59.847,...,0,0,3.552742,-0.765244,0.784605,0.013889,0.138889,0,1532.631571,1
4,2.89251,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,0576445d74e374c92c0902e612fca356,59.847,...,0,0,3.552742,-0.660232,0.814163,0.041667,0.138889,0,1951.865761,1


In [7]:
df = pd.merge(df_valid.loc[:, ["customer_id"]], df, how='left', on='customer_id', suffixes=('', ''))
df['lat_pred'] = df['lon_pred'] = np.nan
df.head()

Unnamed: 0,customer_id,amount,city,country,currency,home_add_lat,home_add_lon,mcc,terminal_id,work_add_lat,...,amount_mean,amount_sub,amount_div,pct_term_counts,pct_mcc_g_counts,is_abroad,amount_true,is_native,lat_pred,lon_pred
0,3209e87b652f46439784c4d254dd5d12,2.744375,84,0,643,55.629002,37.863998,5411,f1fa9245477a6a5183fd48cdd51f51c4,55.786999,...,3.37804,-0.633665,0.812416,0.010638,0.191489,0,1387.761263,0,,
1,3209e87b652f46439784c4d254dd5d12,3.979987,128,0,643,55.629002,37.863998,5411,a5af0cb2a1c1ba1e589f589495fe8310,55.786999,...,3.37804,0.601947,1.178194,0.021277,0.191489,0,23874.094837,0,,
2,3209e87b652f46439784c4d254dd5d12,3.749979,128,0,643,55.629002,37.863998,5411,6a16f0fa130e2d2475f0356ef37f32b6,55.786999,...,3.37804,0.371939,1.110105,0.010638,0.191489,0,14057.853976,0,,
3,3209e87b652f46439784c4d254dd5d12,2.926858,0,0,643,55.629002,37.863998,5814,3b4607adc8746afd656fee00be2b8822,55.786999,...,3.37804,-0.451181,0.866437,0.042553,0.276596,0,2112.507188,1,,
4,3209e87b652f46439784c4d254dd5d12,2.057587,0,0,643,55.629002,37.863998,4111,27dd8a3db77635f597a65c4ef24d68d5,55.786999,...,3.37804,-1.320452,0.609107,0.010638,0.010638,0,285.448099,1,,


In [8]:
def choose_best(group):
    y_pred = group.loc[:, pred_columns].values
    index = np.unravel_index(np.argmax(y_pred, axis=None), y_pred.shape)
    lat, lon = group.iloc[index[0]].loc[["lat", "lon"]]
    lat_mul, lon_mul = multip[index[1]]
    lat += lat_mul * 0.02
    lon += lon_mul * 0.02
    return pd.Series([lat, lon], index=['lat', 'lon'])

In [9]:
features = [f for f in df.columns
    if f not in {'home_add_lon', 'home_add_lat',
                 'work_add_lon', 'work_add_lat',
                 'is_work', 'is_home',
                 'customer_id', 'terminal_id'} and f not in time_features]
features

[u'amount',
 u'city',
 u'country',
 u'currency',
 u'mcc',
 u'access24h',
 u'is_office',
 u'lat',
 u'lon',
 u'n_points',
 u'is_atm',
 u'is_raiff',
 u'is_partner',
 u'pct_same',
 u'neg_home',
 u'neg_work',
 u'pos_home',
 u'pos_work',
 u'mcc_group',
 u'has_car',
 u'has_children',
 u'has_pet',
 u'amount_mean',
 u'amount_sub',
 u'amount_div',
 u'pct_term_counts',
 u'pct_mcc_g_counts',
 u'is_abroad',
 u'amount_true',
 u'is_native',
 'lat_pred',
 'lon_pred']

In [10]:
multip = [(lat_mul, lon_mul) for lat_mul in range(-2, 3) for lon_mul in range(-2, 3)]

# multip = [(0, 0)]

for i, (lat_mul, lon_mul) in enumerate(multip):
    df.loc[:, "lat_pred"] = df.loc[:, "lat"] * lat_mul
    df.loc[:, "lon_pred"] = df.loc[:, "lon"] * lon_mul
        
    y_pred = model.predict(xgb.DMatrix(df.loc[:, features].values))
    df["pred@{}".format(i)] = y_pred
        
    print i, "predicted..."

pred_columns = filter(lambda c: '@' in c, df.columns)
df_pred = df[['customer_id', 'lat', 'lon'] + pred_columns].groupby('customer_id').apply(choose_best)
df_pred.reset_index(drop=False, inplace=True)
df_pred.head()

0 predicted...
1 predicted...
2 predicted...
3 predicted...
4 predicted...
5 predicted...
6 predicted...
7 predicted...
8 predicted...
9 predicted...
10 predicted...
11 predicted...
12 predicted...
13 predicted...
14 predicted...
15 predicted...
16 predicted...
17 predicted...
18 predicted...
19 predicted...
20 predicted...
21 predicted...
22 predicted...
23 predicted...
24 predicted...


Unnamed: 0,customer_id,lat,lon
0,007525af0bc3ce72137b586a298f7b98,55.973577,37.248717
1,00eee313c10e5496006273dcedca8633,59.793912,30.244891
2,00f6ac562cf73b8d8b8ab14abc1d1a95,59.7963,30.344815
3,0169fd91b58dda2f5da41713387d1796,59.882466,29.746236
4,021057dfdf04a4b8bab5528e8dc0f814,52.633781,103.825707


In [11]:
df = df.loc[:, filter(lambda c: '@' not in c, df.columns)]

In [12]:
df_pred = pd.merge(df_valid, df_pred, how='left', on='customer_id')

In [13]:
score = is_inside_area(df_pred['work_add_lat'], df_pred['work_add_lon'], df_pred['lat'], df_pred['lon'])
print "Valid score: {}".format(score.sum() / float(score.shape[0]))

Valid score: 0.00767754318618
