## Библиотеки 

In [1]:
import numpy as np
import pandas as pd
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

## Функции

In [2]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

## Получение данных

In [3]:
#df_train = pd.read_csv('/kaggle/input/alfabankchallengedata/df_train.csv', sep=';')
#df_test = pd.read_csv('/kaggle/input/alfabankchallengedata/df_test.csv', sep=';')

In [4]:
df_train = pd.read_csv('data/df_train.csv', sep=';')
df_test = pd.read_csv('data/df_test.csv', sep=';')

In [5]:
df_train['Data'] = df_train.Data.apply(lambda s: list(map(int, s.split(','))))
df_train['Target'] = df_train.Target.apply(lambda s: list(map(int, s.split(','))))
df_test['Data'] = df_test.Data.apply(lambda s: list(map(int, s.split(','))))

In [6]:
df_train.head()

Unnamed: 0,Id,Data,Target
0,0,"[4814, 4814, 6010, 6011, 4814, 6011, 6011, 481...","[4814, 4814, 4814, 4814, 5411, 4814, 4814, 481..."
1,1,"[6011, 6011, 6011, 6011, 6011, 6011, 6011, 481...","[4814, 6011, 4814, 6011, 4814, 4814, 6011, 481..."
2,2,"[8021, 6011, 6011, 6010, 4829, 4814, 6011, 601...","[6011, 6011, 6010, 4829, 4829, 6010, 6011, 601..."
3,3,"[4814, 6011, 4814, 4814, 4814, 6011, 6011, 569...","[6011, 6011, 6010, 6011, 6011, 4814, 4814, 601..."
4,4,"[4814, 4814, 4814, 4814, 4814, 4814, 5946, 481...","[5499, 6011, 4814, 4829, 5200, 5411, 5499, 591..."


In [7]:
df_train.shape

(7033, 3)

In [8]:
df_train.dtypes

Id         int64
Data      object
Target    object
dtype: object

In [9]:
df_test.head()

Unnamed: 0,Id,Data
0,0,"[4814, 4814, 6011, 6011, 6010, 6011, 6011, 481..."
1,1,"[6010, 6011, 6010, 5411, 5411, 5977, 6011, 601..."
2,2,"[4814, 6011, 5251, 6011, 7832, 5641, 5814, 482..."
3,3,"[6011, 4722, 4722, 4722, 4814, 6011, 6011, 482..."
4,4,"[4814, 4814, 4814, 6011, 4814, 4814, 4814, 481..."


In [10]:
df_test.shape

(7033, 2)

In [11]:
df_test.dtypes

Id       int64
Data    object
dtype: object

## Baseline 1: топ10 MCC-кодов из train-части

MCC-код и соответствующее количество вхождений в train-часть

In [12]:
top10_codes = df_train['Data'].explode().value_counts().head(10)
top10_codes

6011    700677
6010    490602
4814    473396
5411    472408
4829    307388
5499    164719
5541     68224
5912     65071
5331     61833
5812     52029
Name: Data, dtype: int64

In [13]:
mapk(df_train['Target'], [top10_codes.index]*len(df_train))

0.2742435829727881

## Baseline 2: cамые популярные транзакции пользователя.

**Если таких менее 10, то замешиваются топ10 популярных из всей выборки.**

In [14]:
def get_top_codes(transactions, top_n=10, drop_from=5):
    transactions_stats = sorted(
        Counter(transactions).items(), 
        key=lambda x: x[1], 
        reverse=True
    )[:top_n]
    

    top_codes = [mcc_code for (mcc_code, count) in transactions_stats if count >= drop_from]
    top_codes += list(top10_codes.index)

    return top_codes[:10]

In [15]:
df_train['pred_baseline_2'] = df_train['Data'].apply(get_top_codes)

In [16]:
df_train

Unnamed: 0,Id,Data,Target,pred_baseline_2
0,0,"[4814, 4814, 6010, 6011, 4814, 6011, 6011, 481...","[4814, 4814, 4814, 4814, 5411, 4814, 4814, 481...","[4814, 6011, 5311, 5411, 6011, 6010, 4814, 541..."
1,1,"[6011, 6011, 6011, 6011, 6011, 6011, 6011, 481...","[4814, 6011, 4814, 6011, 4814, 4814, 6011, 481...","[4814, 6011, 6011, 6010, 4814, 5411, 4829, 549..."
2,2,"[8021, 6011, 6011, 6010, 4829, 4814, 6011, 601...","[6011, 6011, 6010, 4829, 4829, 6010, 6011, 601...","[5814, 6010, 4829, 6011, 4814, 5411, 5331, 599..."
3,3,"[4814, 6011, 4814, 4814, 4814, 6011, 6011, 569...","[6011, 6011, 6010, 6011, 6011, 4814, 4814, 601...","[5411, 4814, 6011, 6010, 6012, 5999, 6011, 601..."
4,4,"[4814, 4814, 4814, 4814, 4814, 4814, 5946, 481...","[5499, 6011, 4814, 4829, 5200, 5411, 5499, 591...","[4814, 5411, 6011, 4829, 5912, 5499, 6010, 520..."
...,...,...,...,...
7028,7028,"[6010, 4829, 6011, 6011, 6011, 6010, 6011, 601...","[4814, 5499, 5499, 5411, 5251, 4814, 5499, 549...","[5251, 4814, 5499, 6011, 6010, 4829, 5411, 601..."
7029,7029,"[4814, 5699, 5641, 5411, 6010, 6011, 4814, 601...","[6011, 5261, 6011, 5261, 4814, 5411, 4814, 601...","[6011, 4814, 5411, 5912, 6010, 5499, 4900, 581..."
7030,7030,"[6011, 6011, 6011, 6011, 6011, 6011, 6011, 601...","[6011, 6011, 6011, 6011, 6011, 6011, 6011, 601...","[6011, 6011, 6010, 4814, 5411, 4829, 5499, 554..."
7031,7031,"[4814, 4814, 5411, 6011, 6011, 4814, 4814, 481...","[4814, 6011, 6011, 5999, 5999, 6011, 6011, 601...","[6011, 5499, 4814, 5999, 6010, 6011, 6010, 481..."


In [17]:
mapk(df_train['Target'], df_train['pred_baseline_2'])

0.3236094127683776

## Submission

предсказания из второго бейзлайна

In [18]:
df_test['Predicted'] = df_test['Data'].apply(get_top_codes)

In [19]:
submission_baseline_2 = df_test[['Id', 'Predicted']]
submission_baseline_2['Predicted'] = submission_baseline_2.Predicted.astype(str).str.replace(',', '')
submission_baseline_2.to_csv('data\submission_baseline_3.csv', index=False)