In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
from tqdm import tqdm

In [2]:
train_geo = pd.read_csv('train_geo.csv')
train_geo = train_geo[train_geo.label != 0].reset_index(drop=True)
test_geo = pd.read_csv('test_geo.csv')

# train

In [3]:
tree = BallTree(train_geo[['x', 'y']], leaf_size=2) 

In [4]:
dist, ind = tree.query(train_geo[['x', 'y']], k=51)
dist, ind = dist[:, 1:], ind[:, 1:]
dist *= 107
# 107 - constant for conversion to kilometers

In [5]:
dists = []

for i in tqdm(range(train_geo.shape[0])):
    dist_temp_res = np.zeros((4))
    temp_dist = dist[i]
    for i in range(1, 5):
        dist_temp_res[i-1] += len(temp_dist[temp_dist < i])
    dists.append(dist_temp_res)

100%|█████████████████████████████████| 87112/87112 [00:00<00:00, 112788.09it/s]


In [6]:
dists[5]

array([ 5., 14., 26., 34.])

In [9]:
labs = []

for i in tqdm(range(train_geo.shape[0])):
    labs.append(train_geo.iloc[ind[i]].label.values[:10])

100%|██████████████████████████████████| 87112/87112 [00:07<00:00, 11540.55it/s]


In [10]:
labs[5]

array([5, 5, 5, 5, 5, 5, 3, 3, 5, 5])

In [11]:
labs_count = []

for i in tqdm(range(train_geo.shape[0])):
    lc_temp = np.zeros((9))
    temp_dist = dist[i]
    temp_labs = train_geo.iloc[ind[i]].label.values[:10]
    for i in range(1, 10):
        lc_temp[i-1] += len(temp_labs[temp_labs == i])
    labs_count.append(lc_temp)

100%|███████████████████████████████████| 87112/87112 [00:09<00:00, 9479.18it/s]


In [12]:
labs_count[5]

array([0., 0., 2., 0., 8., 0., 0., 0., 0.])

In [13]:
train_nearest_dist = pd.DataFrame(dists, columns=['dist_{}'.format(x) for x in range(1, 5)])
train_nearest = pd.DataFrame(labs, columns=['nearest_{}'.format(x) for x in range(1, 11)], dtype=int)
train_nearest_count = pd.DataFrame(labs_count, columns=['nearest_{}_count'.format(x) for x in range(1, 10)])
train_nearest_count['most_frequent'] = train_nearest_count.apply(lambda x: 1+np.where(x == x.max())[0][0], axis=1)

In [14]:
train_result = pd.concat([train_geo, train_nearest, train_nearest_count, train_nearest_dist], axis=1)
train_result = train_result[train_result.columns[1:]]

In [15]:
train_result

Unnamed: 0,field_id,label,square,x,y,nearest_1,nearest_2,nearest_3,nearest_4,nearest_5,...,nearest_5_count,nearest_6_count,nearest_7_count,nearest_8_count,nearest_9_count,most_frequent,dist_1,dist_2,dist_3,dist_4
0,43134,5,11018,-31.781216,18.337039,5,5,5,3,5,...,6.0,0.0,0.0,0.0,0.0,5,3.0,16.0,32.0,34.0
1,67175,6,25,-31.771337,18.350873,6,2,2,6,5,...,2.0,4.0,0.0,0.0,0.0,6,8.0,16.0,24.0,37.0
2,75612,3,13798,-31.775677,18.328824,3,3,5,3,3,...,5.0,0.0,0.0,0.0,0.0,3,1.0,11.0,29.0,33.0
3,84337,3,1545,-31.766074,18.339741,6,5,1,5,6,...,2.0,5.0,0.0,0.0,0.0,6,5.0,17.0,28.0,33.0
4,88832,5,1980,-31.766351,18.346550,5,2,1,6,6,...,1.0,4.0,0.0,0.0,0.0,6,12.0,17.0,24.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87107,56334,2,138,-33.235454,19.239919,2,2,2,2,2,...,0.0,0.0,0.0,0.0,0.0,2,26.0,48.0,50.0,50.0
87108,86618,5,93,-33.242147,19.232390,5,2,2,2,2,...,1.0,0.0,0.0,0.0,0.0,2,10.0,31.0,50.0,50.0
87109,87401,2,30,-33.240038,19.241228,2,2,2,2,2,...,0.0,0.0,0.0,0.0,0.0,2,26.0,50.0,50.0,50.0
87110,98071,2,114,-33.241047,19.236755,2,2,2,2,5,...,2.0,0.0,0.0,0.0,0.0,2,22.0,41.0,50.0,50.0


In [196]:
train_result.to_csv('data/train_geo_2.csv', index=False)

# test

In [16]:
tree = BallTree(train_geo[['x', 'y']], leaf_size=2)

In [19]:
dist, ind = tree.query(test_geo[['x', 'y']], k=50)
dist *= 107
# 107 - constant for conversion to kilometers

In [20]:
ind.shape

(35295, 50)

In [21]:
dists = []

for i in tqdm(range(test_geo.shape[0])):
    dist_temp_res = np.zeros((4))
    temp_dist = dist[i]
    k = 0
    for i in range(1, 5):
        dist_temp_res[k] += len(temp_dist[temp_dist < i])
        k += 1
    dists.append(dist_temp_res)

100%|█████████████████████████████████| 35295/35295 [00:00<00:00, 106117.81it/s]


In [22]:
dists[5000]

array([ 0.,  0.,  7., 50.])

In [23]:
labs = []

for i in tqdm(range(test_geo.shape[0])):
    labs.append(train_geo.iloc[ind[i]].label.values[:10])

100%|██████████████████████████████████| 35295/35295 [00:02<00:00, 11777.00it/s]


In [24]:
labs[5000]

array([7, 2, 7, 7, 7, 7, 7, 7, 7, 7])

In [25]:
labs_count = []

for i in tqdm(range(test_geo.shape[0])):
    lc_temp = np.zeros((9))
    temp_labs = train_geo.iloc[ind[i]].label.values[:10]
    for i in range(1, 10):
        lc_temp[i-1] += len(temp_labs[temp_labs == i])
    labs_count.append(lc_temp)

100%|███████████████████████████████████| 35295/35295 [00:03<00:00, 9516.70it/s]


In [26]:
labs_count[5000]

array([0., 1., 0., 0., 0., 0., 9., 0., 0.])

In [27]:
test_nearest_dist = pd.DataFrame(dists, columns=['dist_{}'.format(x) for x in range(1, 5)])
test_nearest = pd.DataFrame(labs, columns=['nearest_{}'.format(x) for x in range(1, 11)], dtype=int)
test_nearest_count = pd.DataFrame(labs_count, columns=['nearest_{}_count'.format(x) for x in range(1, 10)])
test_nearest_count['most_frequent'] = test_nearest_count.apply(lambda x: 1+np.where(x == x.max())[0][0], axis=1)

In [28]:
test_result = pd.concat([test_geo, test_nearest, test_nearest_count, test_nearest_dist], axis=1)
test_result = test_result[test_result.columns[1:]]

In [195]:
test_result.to_csv('data/test_geo_2.csv', index=False)

In [29]:
test_result

Unnamed: 0,field_id,square,x,y,nearest_1,nearest_2,nearest_3,nearest_4,nearest_5,nearest_6,...,nearest_5_count,nearest_6_count,nearest_7_count,nearest_8_count,nearest_9_count,most_frequent,dist_1,dist_2,dist_3,dist_4
0,2199,398,-33.580269,19.194868,4,4,4,4,4,4,...,0.0,0.0,0.0,0.0,0.0,4,0.0,0.0,30.0,50.0
1,2277,213,-33.573848,19.200675,4,4,4,4,4,4,...,0.0,0.0,0.0,0.0,0.0,4,0.0,0.0,0.0,50.0
2,2571,181,-33.573573,19.204510,4,4,4,4,4,4,...,0.0,0.0,0.0,0.0,0.0,4,0.0,0.0,0.0,50.0
3,3112,120,-33.561649,19.207030,4,4,4,4,4,4,...,0.0,0.0,0.0,0.0,0.0,4,0.0,0.0,7.0,50.0
4,7476,275,-33.575499,19.204510,4,4,4,4,4,4,...,0.0,0.0,0.0,0.0,0.0,4,0.0,0.0,0.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35290,100931,2053,-32.702577,18.649715,9,1,9,9,9,9,...,0.0,0.0,1.0,0.0,8.0,9,0.0,11.0,32.0,50.0
35291,105031,674,-32.701195,18.645137,9,1,9,9,9,9,...,0.0,0.0,1.0,0.0,8.0,9,2.0,19.0,40.0,50.0
35292,108337,4291,-32.705894,18.640669,9,9,1,7,9,9,...,0.0,0.0,3.0,0.0,6.0,9,0.0,15.0,37.0,50.0
35293,112109,18,-32.693823,18.634784,7,9,7,9,7,9,...,0.0,1.0,4.0,0.0,5.0,9,19.0,44.0,50.0,50.0
