In [2]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor



In [3]:
def dcg(r):
    r = np.asfarray(r)[:10]
    if r.size:
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    return 0.

def ndcg(r):
    dcg_max = dcg(sorted(r, reverse=True))
    if not dcg_max:        
        return 1.
    return dcg(r) / dcg_max

In [4]:
def calculate_ndcg(X):
    ndcgs = []
    query_ids = np.unique(X['query_id'])
    for item in query_ids:
        ndcgs.append(ndcg(X[X['query_id'] == item]['relevance'].values))
    ndcgs = np.array(ndcgs)
    return np.mean(ndcgs)

In [5]:
from sklearn.model_selection import KFold

In [6]:
def my_cross_val_score(X, algorithm, features, cv=5):
    query_ids = np.unique(X['query_id'])
    kf = KFold(n_splits=cv)
    kf.get_n_splits(query_ids)
    ndcgs = []
    for train_ind, test_ind in kf.split(query_ids):
        X_train = X[X['query_id'].isin(train_ind)]
        X_test = X[X['query_id'].isin(test_ind)]
        
        
        algorithm.fit(X_train[features].values, X_train['relevance'].values)
        
        X_test = X_test.copy()
        
        X_test['predicted_relevance'] = algorithm.predict(X_test[features].values)
        
        ndcgs.append(calculate_ndcg(X_test.sort_values(['query_id', 'predicted_relevance'], 
                                                       ascending=[True, False])))
    return np.array(ndcgs), np.mean(np.array(ndcgs))

In [7]:
train_pool = pd.read_csv('data/train.csv', encoding='utf-8')
test_pool = pd.read_csv('data/test.csv', encoding='utf-8')

## Напишем функции, генерирующие простые признаки основанные на пересечении триграмм между запросом и названием организации

In [8]:
def get_trigrams(string):
    string = '^^' + string + '$$'
    trigrams = set()
    trigrams_count = 0
    
    for i in range(len(string) - 2):
        trigrams.add(string[i:i+3])
        trigrams_count += 1
        
    return trigrams, trigrams_count

def common_trigrams_factors(query, org_name):
    query_trigrams, query_trigrams_count = get_trigrams(query)
    org_name_trigrams, org_name_trigrams_count = get_trigrams(org_name)

    factors = [float(len(query_trigrams.intersection(org_name_trigrams)))]

    factors.append(0. if query_trigrams_count == 0. else 0.1 + factors[0] / query_trigrams_count)
    factors.append(0. if org_name_trigrams_count == 0. else 0.1 + factors[0] / org_name_trigrams_count)    
    
    return factors

Посчитаем данные факторы для каждого файла

## Достанем информацию о координатах объекта и всё запихаем куда надо.

In [9]:
def get_pos_and_coordinate(t):
    return t['pos']['coordinates']

In [10]:
train_org_information = pd.read_json('data/train_org_information.json', orient='columns', 
                                     convert_dates=False, convert_axes=False)
train_org_information = train_org_information.transpose()

train_address = train_org_information['address'].apply(get_pos_and_coordinate).to_frame()
train_address['org_id'] = train_address.index
train_address[['org_id']] = train_address[['org_id']].apply(pd.to_numeric)
train_pool = pd.merge(train_pool, train_address, left_on='org_id', right_on='org_id')

In [11]:
test_org_information = pd.read_json('data/test_org_information.json', orient='columns', 
                                     convert_dates=False, convert_axes=False)
test_org_information = test_org_information.transpose()

test_address = test_org_information['address'].apply(get_pos_and_coordinate).to_frame()
test_address['org_id'] = test_address.index
test_address[['org_id']] = test_address[['org_id']].apply(pd.to_numeric)
test_pool = pd.merge(test_pool, test_address, left_on='org_id', right_on='org_id')

In [12]:
def split_and_take_first(string):
    answer = string.split(',')
    return float(answer[0])

def split_and_take_second(string):
    answer = string.split(',')
    return float(answer[1])

In [13]:
train_pool['window_x'] = train_pool['window_center'].apply(split_and_take_first)
train_pool['window_y'] = train_pool['window_center'].apply(split_and_take_second)
train_pool['org_x'] = train_pool['address'].apply(lambda x: x[0])
train_pool['org_y'] = train_pool['address'].apply(lambda x: x[1])
train_pool['rel_x'] = np.absolute(train_pool['org_x'] - train_pool['window_x'])
train_pool['rel_y'] = np.absolute(train_pool['org_y'] - train_pool['window_y'])
train_pool['size_x'] = train_pool['window_size'].apply(split_and_take_first)
train_pool['size_y'] = train_pool['window_size'].apply(split_and_take_second)
train_pool['rel/size_x'] = train_pool['rel_x'] / train_pool['size_x']
train_pool['rel/size_y'] = train_pool['rel_y'] / train_pool['size_y']
train_pool['distance'] = (train_pool['rel_x'] ** 2 + train_pool['rel_y'] ** 2) ** 0.5

In [14]:
test_pool['window_x'] = test_pool['window_center'].apply(split_and_take_first)
test_pool['window_y'] = test_pool['window_center'].apply(split_and_take_second)
test_pool['org_x'] = test_pool['address'].apply(lambda x: x[0])
test_pool['org_y'] = test_pool['address'].apply(lambda x: x[1])
test_pool['rel_x'] = np.absolute(test_pool['org_x'] - test_pool['window_x'])
test_pool['rel_y'] = np.absolute(test_pool['org_y'] - test_pool['window_y'])
test_pool['size_x'] = test_pool['window_size'].apply(split_and_take_first)
test_pool['size_y'] = test_pool['window_size'].apply(split_and_take_second)
test_pool['rel/size_x'] = test_pool['rel_x'] / test_pool['size_x']
test_pool['rel/size_y'] = test_pool['rel_y'] / test_pool['size_y']
test_pool['distance'] = (test_pool['rel_x'] ** 2 + test_pool['rel_y'] ** 2) ** 0.5

## Развлекаемся c триграммами рубрик.

In [15]:
train_rubric_information = pd.read_json('data/train_rubric_information.json', orient='columns', 
                                        convert_dates=False, convert_axes=False).transpose()

In [16]:
test_rubric_information = pd.read_json('data/test_rubric_information.json', orient='columns', 
                                        convert_dates=False, convert_axes=False).transpose()

In [17]:
def make_one_string_with_org_descr(names):
    good_languages = ['en', 'tr', 'ru', 'uk', 'uz']
    answer = '^^'
    for item in names:
        if item['locale'] in good_languages:
            answer += item['value']
    answer += '$$'
    return answer

In [18]:
def make_one_string_with_org_descr_ugly(descriptions):
    good_languages = ['en', 'tr', 'ru', 'uk', 'uz']
    answer = '^^'
    for item in descriptions:
        item = item['value']
        if item['locale'] in good_languages:
            answer += item['value']
    answer += '$$'
    return answer

In [19]:
train_rubric_information['concat_names'] = train_rubric_information['names'].apply(
    make_one_string_with_org_descr)
train_rubric_information['concat_keywords'] = train_rubric_information['keywords'].apply(
    make_one_string_with_org_descr)
train_rubric_information['concat_phrases'] = train_rubric_information['phrases'].apply(
    make_one_string_with_org_descr)
train_rubric_information['concat_descriptions'] = train_rubric_information['descriptions'].apply(
    make_one_string_with_org_descr_ugly)


test_rubric_information['concat_names'] = test_rubric_information['names'].apply(
    make_one_string_with_org_descr)
test_rubric_information['concat_keywords'] = test_rubric_information['keywords'].apply(
    make_one_string_with_org_descr)
test_rubric_information['concat_phrases'] = test_rubric_information['phrases'].apply(
    make_one_string_with_org_descr)
test_rubric_information['concat_descriptions'] = test_rubric_information['descriptions'].apply(
    make_one_string_with_org_descr_ugly)

In [20]:
train_rubric_information.index = pd.to_numeric(train_rubric_information.index)
train_rubric_information['rubric_id'] = train_rubric_information.index
train_org_information['single_rubric_id'] = train_org_information['rubrics'].apply(lambda x: int(x[0]))

train_org_information.index = pd.to_numeric(train_org_information.index)
train_org_information['org_id'] = train_org_information.index

test_rubric_information.index = pd.to_numeric(test_rubric_information.index)
test_rubric_information['rubric_id'] = test_rubric_information.index
test_org_information['single_rubric_id'] = test_org_information['rubrics'].apply(lambda x: int(x[0]))

test_org_information.index = pd.to_numeric(test_org_information.index)
test_org_information['org_id'] = test_org_information.index

## Триграммы адресов организаций.

In [41]:
train_org_information['string_address'] = train_org_information['address'].apply(
    lambda x: x['formatted']['value'])
test_org_information['string_address'] = test_org_information['address'].apply(
    lambda x: x['formatted']['value']).values

## Сольём организации и train_org_info:

In [42]:
new_train_pool = pd.merge(train_pool, pd.merge(train_org_information, train_rubric_information, 
                                               left_on='single_rubric_id', 
                                               right_on='rubric_id')[['org_id', 'concat_keywords',  
                                                                      'concat_phrases', 'concat_descriptions', 
                                                                      'concat_names', 'string_address']], 
         left_on='org_id', right_on='org_id')

new_test_pool = pd.merge(test_pool, pd.merge(test_org_information, test_rubric_information, 
                                               left_on='single_rubric_id', 
                                               right_on='rubric_id')[['org_id', 'concat_keywords',  
                                                                      'concat_phrases', 'concat_descriptions', 
                                                                      'concat_names', 'string_address']], 
         left_on='org_id', right_on='org_id')


## Все триграммы

In [44]:
def calc_all_trigram_factors(row):
    trigram_factor_strings = ['org_name', 'concat_keywords', 'concat_phrases', 
                              'concat_descriptions', 'concat_names', 'string_address']

    all_factors = np.empty(0)
    for item in trigram_factor_strings:
        all_factors = np.append(all_factors, 
                                np.array(common_trigrams_factors(row.query, row[item])))
    return pd.Series(all_factors)


In [45]:
train_trigram_factors = new_train_pool.apply(calc_all_trigram_factors, axis=1)

test_trigram_factors = new_test_pool.apply(calc_all_trigram_factors, axis=1)

In [46]:
for i in range(train_trigram_factors.columns.size):
    train_pool['Trig' + str(i)] = train_trigram_factors.values[:, i]
    
for i in range(test_trigram_factors.columns.size):
    test_pool['Trig' + str(i)] = test_trigram_factors.values[:, i]

In [80]:
train_pool['query_len'] = train_pool['query'].apply(lambda x: int(len(x) > 15))
test_pool['query_len'] = test_pool['query'].apply(lambda x: int(len(x) > 15))

## Отберём фичи для обучения

In [89]:
features = ['size_x', 'size_y', 'rel/size_x', 'rel/size_y', 'distance', 'query_len']
for i in range(18):
    features.append('Trig' + str(i))

Посчитаем кросс-вал-скор.

In [91]:
my_cross_val_score(train_pool, XGBRegressor(n_estimators=500, max_depth=8), features, cv=10)

(array([ 0.81429095,  0.92988094,  0.92578995,  0.93121501,  0.99113647,
         0.99859876,  0.93090933,  1.        ,  0.8904111 ,  0.869623  ]),
 0.92818555205961017)

In [92]:
test_features = test_pool[features]

In [93]:
train_features = train_pool[features]

Попробуем отхерачить целевую переменную на другие промежутки.

In [None]:
train_pool['other_relevance'] 

Экспериметны!

## Натравим на все это xgboost

In [None]:
clf = XGBRegressor(n_estimators=500, max_depth=8)
clf.fit(train_features.values , train_pool[['relevance']].values)

In [60]:
test_pool['relevance'] = clf.predict(test_features.values)

## Записываем в файл

In [61]:
test_pool.sort_values(['query_id', 'relevance'], 
                      ascending=[True, False])[['query_id', 'org_id']].to_csv('result.csv', index=None)