In [1]:
import codecs

from operator import itemgetter

import numpy as np
import pandas as pd

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib

In [2]:
def is_inside_area(lat_pred, lon_pred, lat_true, lon_true, radius=0.02):
    return np.power(lat_pred - lat_true, 2) + np.power(lon_pred - lon_true, 2) <= radius ** 2

In [3]:
time_features = {u'day', u'day_of_week', u'days_after_holiday',
       u'days_before_holiday', u'is_dayoff', u'is_holiday', u'is_short',
       u'is_weekend', u'month', u'timestamp', u'week_of_year'}

### Подготавливаем данные

In [3]:
df = pd.read_csv("data/sets/train_set_final.csv", sep=',', encoding='utf-8')
df.drop(labels=time_features, axis=1, inplace=True)
df.drop_duplicates(inplace=True)
print df.shape
df.head()

(1131820, 36)


Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,terminal_id,work_add_lat,...,has_children,has_pet,amount_mean,amount_sub,amount_div,pct_term_counts,pct_mcc_g_counts,is_abroad,amount_true,is_native
0,2.884034,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,11606fde0c814ce78e0d726e39a0a5ee,59.847,...,0,0,3.552742,-0.668708,0.811777,0.013889,0.138889,0,1914.141006,1
1,2.775633,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,e9647a5e1eacfb06713b6af755ccc595,59.847,...,0,0,3.552742,-0.777109,0.781265,0.013889,0.138889,0,1491.327772,1
2,3.708368,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,df06c1fcd3718a514535ae822785f716,59.847,...,0,0,3.552742,0.155626,1.043805,0.013889,0.013889,0,12773.452874,1
4,2.787498,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,6c5e5793ebc984fb72875feffff62854,59.847,...,0,0,3.552742,-0.765244,0.784605,0.013889,0.138889,0,1532.631571,1
6,2.89251,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,0576445d74e374c92c0902e612fca356,59.847,...,0,0,3.552742,-0.660232,0.814163,0.041667,0.138889,0,1951.865761,1


In [4]:
df.to_csv("data/sets/tmp/train_set_final.csv", sep=',', encoding='utf-8', index=False)

In [5]:
np.random.seed(8888)

In [7]:
df = pd.read_csv("data/sets/tmp/train_set_final.csv", sep=',', encoding='utf-8', usecols=['customer_id'])
df.head()

Unnamed: 0,customer_id
0,0dc0137d280a2a82d2dc89282450ff1b
1,0dc0137d280a2a82d2dc89282450ff1b
2,0dc0137d280a2a82d2dc89282450ff1b
3,0dc0137d280a2a82d2dc89282450ff1b
4,0dc0137d280a2a82d2dc89282450ff1b


In [8]:
customers = df['customer_id'].unique()
np.random.shuffle(customers)
    
border = int(len(customers) * 0.9)
cust_train, cust_valid = set(customers[:border]), set(customers[border:])

In [9]:
variants_mods = [(i,j) for i in range(-2, 3) for j in range(-2, 3)]
variants_mods.remove((0,0))

with codecs.open('data/sets/tmp/train_set_final.csv', mode='r', encoding='utf-8') as f_input,\
        codecs.open('data/sets/tmp/valid_home.libsvm', mode='w', encoding='utf-8') as f_valid,\
        codecs.open('data/sets/tmp/train_home.libsvm', mode='w', encoding='utf-8') as f_train,\
        codecs.open('data/sets/tmp/valid_home.labels', mode='w', encoding='utf-8') as f_valid_,\
        codecs.open('data/sets/tmp/train_home.labels', mode='w', encoding='utf-8') as f_train_:
    f_train_.write('customer_id,home_add_lat,home_add_lon,lat,lon\n')
    f_valid_.write('customer_id,home_add_lat,home_add_lon,lat,lon\n')
    
    # получаем индексы для каждой из колонок
    header = next(f_input).strip().split(',')
    header_cols = { col: col_i for col_i, col in enumerate(header) }
    
    # настраиваем список фичей
    features = [f for f in header
        if f not in {'home_add_lon', 'home_add_lat',
              'work_add_lon', 'work_add_lat',
              'is_work', 'is_home',
              'customer_id', 'terminal_id'} and f not in time_features]
    n_features = len(features)
    
    def format_line():
        is_coord = int(is_inside_area(lat, lon, lat_coord, lon_coord))
        output_line = [
            str(is_coord),
            features_line,
            '{}:{}'.format(n_features, lat),
            '{}:{}'.format(n_features + 1, lon)
        ]
        output_line = ' '.join(output_line) + '\n'
        return output_line
    
    for line in f_input:        
        line = line.strip()
        line_cols = line.split(',')
        
        lat_coord, lon_coord = \
            line_cols[header_cols['home_add_lat']], line_cols[header_cols['home_add_lon']]
            
        if lat_coord == '' or lon_coord == '':
            continue
            
        lat_coord, lon_coord = map(float, [lat_coord, lon_coord])
        
        cust_id = line_cols[header_cols['customer_id']]
        
        f_output_ = f_train_ if cust_id in cust_train else f_valid_
        
        f_output = f_train if cust_id in cust_train else f_valid
        
        features_line = [str(f_i)+':'+line_cols[header_cols[f]] for f_i, f in enumerate(features)]
        features_line = ' '.join(features_line)
        
        labels_line = ','.join([cust_id, str(lat_coord), str(lon_coord)])
        
        lat, lon = line_cols[header_cols['lat']], line_cols[header_cols['lon']]
        if lat != '' and lon != '':
            lat, lon = map(float, [lat, lon])
            f_output.write(format_line())
            f_output_.write(labels_line+','+str(lat)+','+str(lon)+'\n')
        
            np.random.shuffle(variants_mods)
            for mul_lat, mul_lon in variants_mods[:4]:
                lat_ = lat + mul_lat * 0.02
                lon_ = lon + mul_lon * 0.02
                f_output.write(format_line())
                f_output_.write(labels_line+','+str(lat_)+','+str(lon_)+'\n')
        
        lat, lon = lat_coord, lon_coord
        lat, lon = map(float, [lat, lon])
        f_output.write(format_line())
        f_output_.write(labels_line+','+str(lat)+','+str(lon)+'\n')
        
        np.random.shuffle(variants_mods)
        for mul_lat, mul_lon in variants_mods[:4]:
            lat_ = lat + mul_lat * 0.02
            lon_ = lon + mul_lon * 0.02
            f_output.write(format_line())
            f_output_.write(labels_line+','+str(lat_)+','+str(lon_)+'\n')

In [None]:
def choose_random_lines(libsvm, labels, frac=0.8):
    with codecs.open(libsvm, mode='r') as f_libsvm,\
            codecs.open(labels, mode='r') as f_labels,\
            codecs.open(libsvm+'_random', mode='w') as f_libsvm_r,\
            codecs.open(labels+'_random', mode='w') as f_labels_r:
        try:
            while True:
                line_1, line_2 = next(f_libsvm), next(f_labels)
                if np.random.random() < frac:
                    f_libsvm_r.write(line_1)
                    f_labels_r.write(line_2)
        except StopIteration:
            pass

In [None]:
choose_random_lines('data/sets/tmp/train_home.libsvm', 'data/sets/tmp/train_home.labels', frac=0.5)

In [None]:
choose_random_lines('data/sets/tmp/valid_home.libsvm', 'data/sets/tmp/valid_home.labels', frac=0.5)

In [None]:
with codecs.open('data/sets/train_set_final.csv', mode='r', encoding='utf-8') as f_input:
    header = next(f_input).strip().split(',')
    header_cols = { col: col_i for col_i, col in enumerate(header) }
    
    # настраиваем список фичей
    features = [f for f in header
        if f not in {'home_add_lon', 'home_add_lat',
              'work_add_lon', 'work_add_lat',
              'is_work', 'is_home',
              'customer_id', 'terminal_id'}]
    n_features = len(features)

### Обучаем модель

In [3]:
train_set = xgb.DMatrix('data/sets/tmp/train_home.libsvm')

In [4]:
valid_set = xgb.DMatrix('data/sets/tmp/valid_home.libsvm')

In [5]:
clf = xgb.XGBClassifier(n_estimators=10, n_jobs=3, max_depth=7, objective='binary:logistic')

model = xgb.train(
    params=clf.get_xgb_params(),
    dtrain=train_set,
    evals=[(train_set, 'train'), (valid_set, 'valid')],
    verbose_eval=1,
    num_boost_round=300,
    early_stopping_rounds=10
)

[0]	train-error:0.320939	valid-error:0.318144
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 10 rounds.
[1]	train-error:0.31907	valid-error:0.317881
[2]	train-error:0.319388	valid-error:0.31827
[3]	train-error:0.314986	valid-error:0.313211
[4]	train-error:0.300575	valid-error:0.298489
[5]	train-error:0.301127	valid-error:0.299757
[6]	train-error:0.289752	valid-error:0.289991
[7]	train-error:0.290403	valid-error:0.289701
[8]	train-error:0.288218	valid-error:0.287432
[9]	train-error:0.282867	valid-error:0.283215
[10]	train-error:0.279541	valid-error:0.280492
[11]	train-error:0.275312	valid-error:0.276302
[12]	train-error:0.274046	valid-error:0.274979
[13]	train-error:0.270216	valid-error:0.271292
[14]	train-error:0.26605	valid-error:0.267274
[15]	train-error:0.264224	valid-error:0.265535
[16]	train-error:0.263018	valid-error:0.264334
[17]	train-error:0.261005	valid-error:0.261775
[18]	train-error:0.2

In [6]:
joblib.dump(model, 'data/models/clf_home_04.model')

['data/models/clf_home_04.model']

### Оценка модели 

In [4]:
model = joblib.load('data/models/clf_home_04.model')

### Train

In [3]:
train_set = xgb.DMatrix('data/sets/tmp/train_home.libsvm')

In [5]:
y_pred = model.predict(train_set)
y_pred.shape

(10214230,)

In [6]:
df = pd.read_csv('data/sets/tmp/train_home.labels', sep=',', encoding='utf-8')
df.columns = ['customer_id', 'home_lat', 'home_lon', 'lat', 'lon']
print df.shape
df.head()

(10214230, 5)


Unnamed: 0,customer_id,home_lat,home_lon,lat,lon
0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,59.844074,30.179153
1,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,59.844074,30.199153
2,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,59.824074,30.179153
3,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,59.824074,30.199153
4,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,59.864074,30.159153


In [7]:
df['is_home'] = y_pred

In [9]:
df_preds = []

for cust_i, group in df.groupby('customer_id'):
    pred_i = group['is_home'].idxmax()
    home_lat, home_lon, lat, lon = group.loc[pred_i, ['home_lat', 'home_lon', 'lat', 'lon']]
    df_preds.append((cust_i, home_lat, home_lon, lat, lon))
    
df_preds = pd.DataFrame(df_preds, columns=['customer_id', 'home_lat', 'home_lon', 'lat', 'lon'])
df_preds.head()

Unnamed: 0,customer_id,home_lat,home_lon,lat,lon
0,0001f322716470bf9bfc1708f06f00fc,44.708,37.775002,44.708,37.775002
1,0007297d86e14bd68bd87b1dbdefe302,55.799,37.388,55.799,37.388
2,000b709c6c6fb1e8efcfd95e57c2a9de,54.993999,82.863998,54.993999,82.863998
3,0027a7618d97cc9fbda55fac457eaeb7,55.742001,37.575001,55.742001,37.575001
4,002b9f6e118c54f1292e03d1a04d516e,55.693001,37.594002,55.693001,37.594002


In [16]:
score = is_inside_area(df_preds['home_lat'], df_preds['home_lon'], df_preds['lat'], df_preds['lon'])
print "Train score: {}".format(score.sum() / float(score.shape[0]))

Train score: 0.971


### Valid

In [17]:
valid_set = xgb.DMatrix('data/sets/tmp/valid_home.libsvm')

In [21]:
y_pred = model.predict(valid_set)
y_pred.shape

(1103810,)

In [19]:
df = pd.read_csv('data/sets/tmp/valid_home.labels', sep=',', encoding='utf-8')
df.columns = ['customer_id', 'home_lat', 'home_lon', 'lat', 'lon']
print df.shape
df.head()

(1103810, 5)


Unnamed: 0,customer_id,home_lat,home_lon,lat,lon
0,13d194bdf7adf721eeec5c438879da08,56.734001,37.166,55.668335,37.518272
1,13d194bdf7adf721eeec5c438879da08,56.734001,37.166,55.708335,37.518272
2,13d194bdf7adf721eeec5c438879da08,56.734001,37.166,55.708335,37.478272
3,13d194bdf7adf721eeec5c438879da08,56.734001,37.166,55.648335,37.558272
4,13d194bdf7adf721eeec5c438879da08,56.734001,37.166,55.688335,37.478272


In [22]:
df['is_home'] = y_pred

In [23]:
df_preds = []

for cust_i, group in df.groupby('customer_id'):
    pred_i = group['is_home'].idxmax()
    home_lat, home_lon, lat, lon = group.loc[pred_i, ['home_lat', 'home_lon', 'lat', 'lon']]
    df_preds.append((cust_i, home_lat, home_lon, lat, lon))
    
df_preds = pd.DataFrame(df_preds, columns=['customer_id', 'home_lat', 'home_lon', 'lat', 'lon'])
df_preds.head()

Unnamed: 0,customer_id,home_lat,home_lon,lat,lon
0,0037f3de3d890df1022cc760a1dfd9d6,55.622002,37.841999,55.622002,37.841999
1,007525af0bc3ce72137b586a298f7b98,55.821999,37.370998,55.821999,37.370998
2,00a840ccc4f35f8ead26610f788a6a61,59.945,30.495001,59.945,30.495001
3,00cefcb1c777d2bf7a32f082c040088f,55.679001,37.543999,55.679001,37.543999
4,00dc51ccd3993f55974bd7afc1ceecea,55.844002,37.580002,55.844002,37.580002


In [24]:
score = is_inside_area(df_preds['home_lat'], df_preds['home_lon'], df_preds['lat'], df_preds['lon'])
print "Valid score: {}".format(score.sum() / float(score.shape[0]))

Valid score: 0.962


### FAIR SCORE

In [4]:
model = joblib.load('data/models/clf_home_04.model')

In [5]:
df_valid = pd.read_csv('data/sets/tmp/valid_home.labels', sep=',', encoding='utf-8',
                       usecols=['customer_id', 'home_add_lat', 'home_add_lon'])
df_valid.drop_duplicates(inplace=True)
df_valid.head()

Unnamed: 0,customer_id,home_add_lat,home_add_lon
0,13d194bdf7adf721eeec5c438879da08,56.734001,37.166
110,3209e87b652f46439784c4d254dd5d12,55.629002,37.863998
750,331e65f3b6beb0991d91468d2e0f50d2,55.612,37.596001
1230,455564b15a4fea65f12caf304b1b2571,55.806,37.540001
2350,9c8cf84c4bae8d5341f4fd06c964e8ed,59.827,30.214001


In [6]:
df = pd.read_csv("data/sets/tmp/train_set_final.csv", sep=',', encoding='utf-8')
df.head()

Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,terminal_id,work_add_lat,...,has_children,has_pet,amount_mean,amount_sub,amount_div,pct_term_counts,pct_mcc_g_counts,is_abroad,amount_true,is_native
0,2.884034,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,11606fde0c814ce78e0d726e39a0a5ee,59.847,...,0,0,3.552742,-0.668708,0.811777,0.013889,0.138889,0,1914.141006,1
1,2.775633,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,e9647a5e1eacfb06713b6af755ccc595,59.847,...,0,0,3.552742,-0.777109,0.781265,0.013889,0.138889,0,1491.327772,1
2,3.708368,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,df06c1fcd3718a514535ae822785f716,59.847,...,0,0,3.552742,0.155626,1.043805,0.013889,0.013889,0,12773.452874,1
3,2.787498,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,6c5e5793ebc984fb72875feffff62854,59.847,...,0,0,3.552742,-0.765244,0.784605,0.013889,0.138889,0,1532.631571,1
4,2.89251,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,0576445d74e374c92c0902e612fca356,59.847,...,0,0,3.552742,-0.660232,0.814163,0.041667,0.138889,0,1951.865761,1


In [7]:
df = pd.merge(df_valid.loc[:, ["customer_id"]], df, how='left', on='customer_id', suffixes=('', ''))
df['lat_pred'] = df['lon_pred'] = np.nan
df.head()

Unnamed: 0,customer_id,amount,city,country,currency,home_add_lat,home_add_lon,mcc,terminal_id,work_add_lat,...,amount_mean,amount_sub,amount_div,pct_term_counts,pct_mcc_g_counts,is_abroad,amount_true,is_native,lat_pred,lon_pred
0,13d194bdf7adf721eeec5c438879da08,2.70224,0,0,643,56.734001,37.166,5641,1ba8bbab7a1e35baa53bb0d2a3119fe1,,...,2.995777,-0.293538,0.902016,0.0625,0.0625,0,1259.446606,1,,
1,13d194bdf7adf721eeec5c438879da08,2.195717,9,0,643,56.734001,37.166,5814,cab40f7bb974797ecbdeae0507288099,,...,2.995777,-0.80006,0.732937,0.0625,0.3125,0,392.335047,0,,
2,13d194bdf7adf721eeec5c438879da08,2.648815,0,0,643,56.734001,37.166,5912,f301fe6b6925d989b4bae659461188c7,,...,2.995777,-0.346962,0.884183,0.0625,0.3125,0,1113.66652,1,,
3,13d194bdf7adf721eeec5c438879da08,2.243423,0,0,643,56.734001,37.166,5814,6f76119f98fa96f2e2e659e87e14fb5d,,...,2.995777,-0.752355,0.748862,0.125,0.3125,0,437.887709,1,,
4,13d194bdf7adf721eeec5c438879da08,2.336553,0,0,643,56.734001,37.166,5814,6f76119f98fa96f2e2e659e87e14fb5d,,...,2.995777,-0.659225,0.779949,0.125,0.3125,0,542.616041,1,,


In [8]:
def choose_best(group):
    y_pred = group.loc[:, pred_columns].values
    index = np.unravel_index(np.argmax(y_pred, axis=None), y_pred.shape)
    lat, lon = group.iloc[index[0]].loc[["lat", "lon"]]
    lat_mul, lon_mul = multip[index[1]]
    lat += lat_mul * 0.02
    lon += lon_mul * 0.02
    return pd.Series([lat, lon], index=['lat', 'lon'])

In [9]:
features = [f for f in df.columns
    if f not in {'home_add_lon', 'home_add_lat',
                 'work_add_lon', 'work_add_lat',
                 'is_work', 'is_home',
                 'customer_id', 'terminal_id'} and f not in time_features]
features

[u'amount',
 u'city',
 u'country',
 u'currency',
 u'mcc',
 u'access24h',
 u'is_office',
 u'lat',
 u'lon',
 u'n_points',
 u'is_atm',
 u'is_raiff',
 u'is_partner',
 u'pct_same',
 u'neg_home',
 u'neg_work',
 u'pos_home',
 u'pos_work',
 u'mcc_group',
 u'has_car',
 u'has_children',
 u'has_pet',
 u'amount_mean',
 u'amount_sub',
 u'amount_div',
 u'pct_term_counts',
 u'pct_mcc_g_counts',
 u'is_abroad',
 u'amount_true',
 u'is_native',
 'lat_pred',
 'lon_pred']

In [10]:
multip = [(lat_mul, lon_mul) for lat_mul in range(-2, 3) for lon_mul in range(-2, 3)]

# multip = [(0, 0)]

for i, (lat_mul, lon_mul) in enumerate(multip):
    df.loc[:, "lat_pred"] = df.loc[:, "lat"] * lat_mul
    df.loc[:, "lon_pred"] = df.loc[:, "lon"] * lon_mul
        
    y_pred = model.predict(xgb.DMatrix(df.loc[:, features].values))
    df["pred@{}".format(i)] = y_pred
        
    print i, "predicted..."
        
pred_columns = filter(lambda c: '@' in c, df.columns)
df_pred = df[['customer_id', 'lat', 'lon'] + pred_columns].groupby('customer_id').apply(choose_best)
df_pred.reset_index(drop=False, inplace=True)
df_pred.head()

0 predicted...
1 predicted...
2 predicted...
3 predicted...
4 predicted...
5 predicted...
6 predicted...
7 predicted...
8 predicted...
9 predicted...
10 predicted...
11 predicted...
12 predicted...
13 predicted...
14 predicted...
15 predicted...
16 predicted...
17 predicted...
18 predicted...
19 predicted...
20 predicted...
21 predicted...
22 predicted...
23 predicted...
24 predicted...


Unnamed: 0,customer_id,lat,lon
0,0037f3de3d890df1022cc760a1dfd9d6,55.76049,37.782098
1,007525af0bc3ce72137b586a298f7b98,55.769626,37.597295
2,00a840ccc4f35f8ead26610f788a6a61,59.917203,30.337762
3,00cefcb1c777d2bf7a32f082c040088f,59.89952,30.360276
4,00dc51ccd3993f55974bd7afc1ceecea,55.950183,37.502191


In [11]:
df = df.loc[:, filter(lambda c: '@' not in c, df.columns)]

In [12]:
df_pred = pd.merge(df_valid, df_pred, how='left', on='customer_id')

In [13]:
score = is_inside_area(df_pred['home_add_lat'], df_pred['home_add_lon'], df_pred['lat'], df_pred['lon'])
print "Valid score: {}".format(score.sum() / float(score.shape[0]))

Valid score: 0.0107212475634
