In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor



In [2]:
def dcg(r, k=10):
    """Discounted cumulative gain of a ranked list of relevance scores.

    Only the first ``k`` scores are used. The first score is taken as is and
    the score at 0-based position ``i >= 1`` is divided by ``log2(i + 1)``
    (so the second score is effectively undiscounted, since log2(2) == 1).

    Parameters
    ----------
    r : sequence of numbers
        Relevance scores in ranked order (best-ranked first).
    k : int or None, default 10
        Cutoff applied as ``r[:k]``; ``None`` keeps the whole list.

    Returns
    -------
    float
        The DCG value, or ``0.`` when there are no scores after truncation.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the documented replacement and works on older versions too.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    return 0.

def ndcg(r):
    """Normalised DCG of a ranked list of relevance scores.

    Divides ``dcg(r)`` by the DCG of the same scores sorted in descending
    order. Returns ``0.`` when that ideal DCG is zero (for example an empty
    list or all-zero relevances), which avoids a division by zero.
    """
    ideal = dcg(sorted(r, reverse=True))
    return dcg(r) / ideal if ideal else 0.

In [3]:
# Load the train and test pools from their UTF-8 encoded CSV files.
train_pool, test_pool = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

Напишем функции, генерирующие простые признаки, основанные на пересечении множеств триграмм запроса и названия организации.

In [4]:
def get_trigrams(string):
    """Split a string into character trigrams.

    The string is padded with ``'^^'`` in front and ``'$$'`` behind before
    every 3-character window is taken.

    Returns a tuple ``(trigrams, trigrams_count)``: the set of distinct
    trigrams and the total number of windows (repeats included).
    """
    padded = '^^' + string + '$$'
    windows = [padded[start:start + 3] for start in range(len(padded) - 2)]
    return set(windows), len(windows)

def common_trigrams_factors(query, org_name):
    """Trigram-overlap features between a query and an organisation name.

    Returns a list of three floats:
      0. the number of distinct trigrams present in both strings;
      1. ``0.1 +`` that number divided by the query's total trigram count;
      2. ``0.1 +`` that number divided by the name's total trigram count.

    The numerator counts distinct trigrams while the denominators count all
    windows, repeats included. A ratio is reported as ``0.`` if its
    denominator is zero.
    """
    query_set, query_total = get_trigrams(query)
    name_set, name_total = get_trigrams(org_name)

    shared = float(len(query_set & name_set))

    def coverage(total):
        # Guard against dividing by an empty trigram count.
        if total == 0.:
            return 0.
        return 0.1 + shared / total

    return [shared, coverage(query_total), coverage(name_total)]

Посчитаем данные факторы для каждого файла

In [108]:
def calc_trigram_factors(row):
    """Compute the trigram-overlap features for one pool row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series produced by ``apply(axis=1)``)
        Must provide the ``'query'`` and ``'org_name'`` entries.

    Returns
    -------
    pandas.Series
        The values returned by ``common_trigrams_factors`` for this row.
    """
    # Look the fields up by label: `query` is also the name of the
    # DataFrame.query method, so attribute-style access is fragile here.
    return pd.Series(common_trigrams_factors(row['query'], row['org_name']))
    
# Row-wise application: every row yields one Series of trigram factors,
# so each result is a frame with one column per factor.
train_trigram_factors, test_trigram_factors = (
    pool.apply(calc_trigram_factors, axis=1) for pool in (train_pool, test_pool)
)

In [109]:
# Copy the factor columns into the pools positionally (via .values),
# naming them Trig0..Trig2 in the order the factors were produced.
trig_columns = ('Trig0', 'Trig1', 'Trig2')
for pool, factors in ((train_pool, train_trigram_factors),
                      (test_pool, test_trigram_factors)):
    factor_matrix = factors.values
    for position, column in enumerate(trig_columns):
        pool[column] = factor_matrix[:, position]

Впишем в общий пул эти факторы под говорящими названиями.

## Достанем информацию о координатах организации и добавим её в общий пул (train_pool и test_pool).

In [75]:
def get_pos_and_coordinate(t):
    """Return the ``'coordinates'`` value nested under ``'pos'`` in ``t``."""
    position = t['pos']
    return position['coordinates']

In [73]:
# Read the organisation JSON without date or axis conversion, then transpose
# so that the labels used as org ids below end up on the row index.
train_org_information = pd.read_json(
    'data/train_org_information.json',
    orient='columns',
    convert_dates=False,
    convert_axes=False,
).T

# One-column frame ('address') with the coordinates taken from each address record.
train_address = train_org_information['address'].apply(get_pos_and_coordinate).to_frame()

# Expose the index labels as a numeric `org_id` column to join on.
train_address['org_id'] = pd.to_numeric(train_address.index)

# Default (inner) merge: pool rows whose org_id has no address entry are dropped.
train_pool = pd.merge(train_pool, train_address, on='org_id')

In [74]:
# Read the organisation JSON without date or axis conversion, then transpose
# so that the labels used as org ids below end up on the row index.
test_org_information = pd.read_json(
    'data/test_org_information.json',
    orient='columns',
    convert_dates=False,
    convert_axes=False,
).T

# One-column frame ('address') with the coordinates taken from each address record.
test_address = test_org_information['address'].apply(get_pos_and_coordinate).to_frame()

# Expose the index labels as a numeric `org_id` column to join on.
test_address['org_id'] = pd.to_numeric(test_address.index)

# Default (inner) merge: pool rows whose org_id has no address entry are dropped.
test_pool = pd.merge(test_pool, test_address, on='org_id')

In [80]:
def split_and_take_first(string):
    """Return the text before the first comma of ``string`` as a float."""
    head = string.partition(',')[0]
    return float(head)

def split_and_take_second(string):
    """Return the second comma-separated field of ``string`` as a float.

    Raises ``IndexError`` when the string contains no comma.
    """
    fields = string.split(',')
    second = fields[1]
    return float(second)

In [85]:
# Window centre ("x,y" string) and organisation position (coordinate pair),
# each split into separate x / y columns.
train_pool['window_x'] = train_pool['window_center'].map(split_and_take_first)
train_pool['window_y'] = train_pool['window_center'].map(split_and_take_second)
train_pool['org_x'] = train_pool['address'].map(lambda coords: coords[0])
train_pool['org_y'] = train_pool['address'].map(lambda coords: coords[1])

# Absolute per-axis offset of the organisation from the window centre.
train_pool['rel_x'] = (train_pool['org_x'] - train_pool['window_x']).abs()
train_pool['rel_y'] = (train_pool['org_y'] - train_pool['window_y']).abs()

# Window extent ("x,y" string) split per axis.
train_pool['size_x'] = train_pool['window_size'].map(split_and_take_first)
train_pool['size_y'] = train_pool['window_size'].map(split_and_take_second)

# Offsets expressed in units of the window size.
train_pool['rel/size_x'] = train_pool['rel_x'].div(train_pool['size_x'])
train_pool['rel/size_y'] = train_pool['rel_y'].div(train_pool['size_y'])

# Euclidean length of the offset vector, in the same units as the coordinates.
train_pool['distance'] = (train_pool['rel_x'].pow(2) + train_pool['rel_y'].pow(2)) ** 0.5

In [115]:
# Window centre ("x,y" string) and organisation position (coordinate pair),
# each split into separate x / y columns.
test_pool['window_x'] = test_pool['window_center'].map(split_and_take_first)
test_pool['window_y'] = test_pool['window_center'].map(split_and_take_second)
test_pool['org_x'] = test_pool['address'].map(lambda coords: coords[0])
test_pool['org_y'] = test_pool['address'].map(lambda coords: coords[1])

# Absolute per-axis offset of the organisation from the window centre.
test_pool['rel_x'] = (test_pool['org_x'] - test_pool['window_x']).abs()
test_pool['rel_y'] = (test_pool['org_y'] - test_pool['window_y']).abs()

# Window extent ("x,y" string) split per axis.
test_pool['size_x'] = test_pool['window_size'].map(split_and_take_first)
test_pool['size_y'] = test_pool['window_size'].map(split_and_take_second)

# Offsets expressed in units of the window size.
test_pool['rel/size_x'] = test_pool['rel_x'].div(test_pool['size_x'])
test_pool['rel/size_y'] = test_pool['rel_y'].div(test_pool['size_y'])

# Euclidean length of the offset vector, in the same units as the coordinates.
test_pool['distance'] = (test_pool['rel_x'].pow(2) + test_pool['rel_y'].pow(2)) ** 0.5

In [112]:
# Preview the first four rows of the enriched training pool.
train_pool.head(4)

Unnamed: 0,query_id,query,region,org_name,org_id,window_center,window_size,relevance,window_x,address,...,rel_x,rel_y,size_x,size_y,rel/size_x,rel/size_y,distance,Trig0,Trig1,Trig2
0,11,"суд, Украина, Днепропетровская область, Днепро...",21775,Суд Жовтневого району міста Дніпропетровськ,1021049127,"34.613119,48.506531","0.025928,0.017380",0.0,34.613119,"[35.053575, 48.456644]",...,0.440456,0.049887,0.025928,0.01738,16.987658,2.870368,0.443272,9.0,0.22,0.3
1,11,"суд, Украина, Днепропетровская область, Днепро...",21775,Дніпропетровський окружний адміністративний суд,1602348889,"34.613119,48.506531","0.025928,0.017380",0.0,34.613119,"[35.005525, 48.429354]",...,0.392406,0.077177,0.025928,0.01738,15.134449,4.440564,0.399923,12.0,0.26,0.344898
2,11,"суд, Украина, Днепропетровская область, Днепро...",21775,Бабушкінський районний суд,1105837793,"34.613119,48.506531","0.025928,0.017380",0.0,34.613119,"[35.046872, 48.463556]",...,0.433753,0.042975,0.025928,0.01738,16.729135,2.47267,0.435877,4.0,0.153333,0.242857
3,11,"суд, Украина, Днепропетровская область, Днепро...",21775,Красногвардійський районний суд,1066267658,"34.613119,48.506531","0.025928,0.017380",0.0,34.613119,"[35.009055, 48.469535]",...,0.395936,0.036996,0.025928,0.01738,15.270595,2.128654,0.397661,4.0,0.153333,0.221212


In [113]:
# Model inputs: window size, size-relative offsets, distance and trigram factors.
train_features = train_pool.loc[:, [
    'size_x', 'size_y',
    'rel/size_x', 'rel/size_y',
    'distance',
    'Trig0', 'Trig1', 'Trig2',
]]

In [116]:
# Same columns, in the same order, as the training feature matrix.
test_features = test_pool.loc[:, [
    'size_x', 'size_y',
    'rel/size_x', 'rel/size_y',
    'distance',
    'Trig0', 'Trig1', 'Trig2',
]]

In [117]:
# Regressor with n_estimators=100; every other hyper-parameter keeps its default.
clf = XGBRegressor(n_estimators=100)
# Features go in as a plain array; the target is the single-column 2-D array
# produced by the double-bracket selection of 'relevance'.
clf.fit(
    train_features.values,
    train_pool[['relevance']].values,
)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [7]:
# Score every test row with the fitted model and keep the score on the pool.
predicted_relevance = clf.predict(test_features.values)
test_pool['relevance'] = predicted_relevance

In [8]:
# Within each query (ascending query_id), list organisations from the highest
# predicted relevance to the lowest and write only the id columns.
ranked_test = test_pool.sort_values(by=['query_id', 'relevance'],
                                    ascending=[True, False])
# `index` is documented as a bool; pass False explicitly instead of relying
# on None being falsy to suppress the index column.
ranked_test[['query_id', 'org_id']].to_csv('result.csv', index=False)