In [79]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from catboost import Pool
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier

Обернем все преобразования сделанные в preprocessing.ipynb в pipeline

In [80]:
class DataframeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, train = True):
        self.train = train

    def fit(self, df):
        return self
    
    def transform(self, df):
        # bd
        median_age = np.median(df[(df['bd'] > 0) & (df['bd'] <= 99)]['bd'])
        df.loc[(df['bd'] <= 0) | (df['bd'] > 99), 'bd'] = median_age

        # song_length
        song_length_age = np.median(df['song_length'])
        df.loc[(df['song_length'] <= 0) | (df['song_length'] > 99), 'bd'] = song_length_age

        # genre_ids
        df['genre_ids'] = df['genre_ids'].fillna('-1')

        df['genre_1'] = df['genre_ids'].apply(lambda x: x.split('|')[0] if len(x.split('|')) > 0 else 'NaN')
        df['genre_2'] = df['genre_ids'].apply(lambda x: x.split('|')[1] if len(x.split('|')) > 1 else 'NaN')
        df['genre_3'] = df['genre_ids'].apply(lambda x: x.split('|')[2] if len(x.split('|')) > 2 else 'NaN')

        df['genre_ids'] = df['genre_ids'].apply(lambda x: list(map(int, x.split('|'))))

        # artist_name
        df['artist_name'] = df['artist_name'].replace('Various Artists', np.nan)

        # language
        df['language'] = df['language'].fillna(52)
        df['language'] = df['language'].apply(lambda x: int(x))

        # most common genres feature
        most_common_genres = df.groupby('msno')['genre_ids'].agg(lambda x: pd.Series(x).explode().value_counts().head(3).index.tolist())
        most_common_genres = most_common_genres.reset_index()

        most_common_genres['most_common_genre1'] = most_common_genres['genre_ids'].apply(lambda x: x[0] if len(x) > 0 else 'NaN')
        most_common_genres['most_common_genre2'] = most_common_genres['genre_ids'].apply(lambda x: x[1] if len(x) > 1 else 'NaN')
        most_common_genres['most_common_genre3'] = most_common_genres['genre_ids'].apply(lambda x: x[2] if len(x) > 2 else 'NaN')

        most_common_genres.drop(columns=['genre_ids'], inplace=True)

        df = pd.merge(df, most_common_genres, on='msno', how='left')

        # most common aritsts feature
        most_common_artists = df.groupby('msno')['artist_name'].agg(lambda x: pd.Series(x).value_counts().head(3).index.tolist())
        most_common_artists = most_common_artists.reset_index()

        most_common_artists['most_common_artist1'] = most_common_artists['artist_name'].apply(lambda x: x[0] if len(x) > 0 else 'NaN')
        most_common_artists['most_common_artist2'] = most_common_artists['artist_name'].apply(lambda x: x[1] if len(x) > 1 else 'NaN')
        most_common_artists['most_common_artist3'] = most_common_artists['artist_name'].apply(lambda x: x[2] if len(x) > 2 else 'NaN')

        most_common_artists.drop(columns=['artist_name'], inplace=True)

        df = pd.merge(df, most_common_artists, on='msno', how='left')

        # most common language feature
        most_common_language = df.groupby('msno')['language'].agg(lambda x: pd.Series(x).value_counts().head(3).index.tolist())
        most_common_language = most_common_language.reset_index()

        most_common_language['most_common_language1'] = most_common_language['language'].apply(lambda x: x[0] if len(x) > 0 else 'NaN')
        most_common_language['most_common_language2'] = most_common_language['language'].apply(lambda x: x[1] if len(x) > 1 else 'NaN')
        most_common_language['most_common_language3'] = most_common_language['language'].apply(lambda x: x[2] if len(x) > 2 else 'NaN')

        most_common_language.drop(columns=['language'], inplace=True)

        df = pd.merge(df, most_common_language, on='msno', how='left')

        columns_to_keep = ['msno', 'song_id', 'source_system_tab', 'source_screen_name', 
                   'source_type', 'city', 'bd', 'gender',
                   'genre_1', 'genre_2', 'genre_3',
                   'most_common_genre1', 'most_common_genre2', 'most_common_genre3',
                   'most_common_artist1', 'most_common_artist2', 'most_common_artist3',
                   'most_common_language1', 'most_common_language2', 'most_common_language3',
                   'song_length', 'artist_name',
                   'language', 'name']
        
        if self.train:
            columns_to_keep = columns_to_keep + ['target']
        
        df = df[columns_to_keep]


        df.fillna('NaN', inplace=True)
        return df

class CatBoostPoolTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, train = True):
        self.train = train
    
    def fit(self, df):
        return self
    
    def transform(self, df):
        cat_features = ['msno', 'song_id', 'source_system_tab', 'source_screen_name', 
                        'source_type', 'city', 'gender',
                        'genre_1', 'genre_2', 'genre_3',
                        'most_common_genre1', 'most_common_genre2', 'most_common_genre3',
                        'most_common_artist1', 'most_common_artist2', 'most_common_artist3',
                        'most_common_language1', 'most_common_language2', 'most_common_language3',
                        'artist_name',
                        'language', 'name']
        if self.train:
            X = df.drop(columns=['target'])
            y = df['target']
        else:
            X = df

        if self.train:
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

            train_group_id = X_train['msno'].factorize()[0]
            train_sorted_indices = train_group_id.argsort()

            train_pool = Pool(X_train.iloc[train_sorted_indices].reset_index(drop=True),
                            label = y_train.iloc[train_sorted_indices].reset_index(drop=True),
                            cat_features=cat_features, 
                            group_id=train_group_id[train_sorted_indices])
            
            val_group_id = X_val['msno'].factorize()[0]
            val_sorted_indices = val_group_id.argsort()
            
            val_pool = Pool(X_val.iloc[val_sorted_indices].reset_index(drop=True),
                            label = y_val.iloc[val_sorted_indices].reset_index(drop=True),
                            cat_features=cat_features, 
                            group_id=val_group_id[val_sorted_indices])
            return train_pool, val_pool

        else:
            group_id = X['msno'].factorize()[0]
            sorted_indices = group_id.argsort()
            test_pool = Pool(X.iloc[sorted_indices].reset_index(drop=True),
                            cat_features=cat_features, 
                            group_id=group_id[sorted_indices])
            return test_pool

# Обучение модели

In [81]:
members_df = pd.read_csv('members.csv')
songs_df = pd.read_csv('songs.csv')
songs_extra_info_df = pd.read_csv('song_extra_info.csv')
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission_df = pd.read_csv('sample_submission.csv')

In [82]:
full_song_df = pd.merge(songs_df, songs_extra_info_df, on = 'song_id', how = 'outer')
full_train_df = pd.merge(train_df, members_df, on = 'msno', how = 'left')
full_train_df = pd.merge(full_train_df, full_song_df, on = 'song_id', how = 'left')

In [83]:
train_pipeline = Pipeline([
    ('dataframe_transformer', DataframeTransformer()),
    ('catboost_pool_transformer', CatBoostPoolTransformer()),
])

test_pipeline = Pipeline([
    ('dataframe_transformer', DataframeTransformer(train = False)),
    ('catboost_pool_transformer', CatBoostPoolTransformer(train = False)),
])

In [84]:
train_pool, val_pool = train_pipeline.fit_transform(full_train_df.copy())

  df.fillna('NaN', inplace=True)


In [85]:
full_test_df = pd.merge(test_df, members_df, on = 'msno', how = 'left')
full_test_df = pd.merge(full_test_df, full_song_df, on = 'song_id', how = 'left')

In [86]:
test_pool = test_pipeline.fit_transform(full_test_df.copy())

  df.fillna('NaN', inplace=True)


Берем стандартный для такого рода задач CatBoostClassifier с испольованием метрики указанной в задании. Затем предсказываем вероятности классов, чтобы получить ранжирование по трекам

In [92]:
catboost_model = CatBoostClassifier(iterations=1000,
                                    depth=2,
                                    early_stopping_rounds=10,
                                    loss_function='CrossEntropy',
                                    eval_metric='NDCG:top=20',
                                    verbose=False)

In [93]:
catboost_model.fit(train_pool)

<catboost.core.CatBoostClassifier at 0x1df8f2a6490>

In [94]:
eval_result = catboost_model.eval_metrics(val_pool, metrics=['NDCG:top=20'])

In [105]:
print('NDCG@20 на валидационном датасете = {:0.3f}'.format(eval_result['NDCG:top=20;type=Base'][-1]))

NDCG@20 на валидационном датасете = 0.853


In [98]:
pred = catboost_model.predict_proba(test_pool)

In [100]:
print(pred[:,0])

[0.66133014 0.64446783 0.67013436 ... 0.75501128 0.75997615 0.81831449]
