In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split

from pandarallel import pandarallel
from tqdm.auto import tqdm
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
tqdm.pandas()
pandarallel.initialize(progress_bar=True)

data_dir = Path("../data/")
embeddings_dir = data_dir / "track_embeddings"
N_TAGS = 256
TEST_SIZE = 0.1
SEED = 0
N_THREADS = 4
N_FOLDS = 5
TIMEOUT = 300

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# # LightAutoML presets, task and report generation
# from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
# from lightautoml.tasks import Task
# from lightautoml.report.report_deco import ReportDeco


In [2]:
df_train = pd.read_csv(data_dir / 'train.csv')
df_test = pd.read_csv(data_dir / 'test.csv')

In [5]:
def process_tags(tags): 
    tags = list(map(int, tags.split(",")))
    return tags


tags = df_train["tags"].parallel_apply(process_tags)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12784), Label(value='0 / 12784')))…

In [6]:
tags.map(len).values.max(), tags.map(len).values.min(), tags.map(len).values.mean()

(33, 1, 3.941839089451246)

In [3]:
def get_embedding(track_id):
    embeddings = np.load(embeddings_dir / f"{track_id}.npy")
    # embedding = np.mean(embeddings, axis=0)
    return embeddings #.tolist()

df_test["embedding"] = df_test.track.parallel_apply(get_embedding)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6395), Label(value='0 / 6395'))), …

In [5]:
shapes = []
for emb in df_test["embedding"].values:
    shapes.append(emb.shape[0])

In [6]:
df_test["emb_shape"] = shapes

In [7]:
df_test["emb_shape"].describe()

count    25580.000000
mean        57.925215
std         25.727005
min         10.000000
25%         43.000000
50%         53.000000
75%         68.000000
max        386.000000
Name: emb_shape, dtype: float64

In [10]:
df_train["emb_shape"].describe()

count    51134.000000
mean        58.099934
std         26.633522
min         10.000000
25%         43.000000
50%         53.000000
75%         68.000000
max        404.000000
Name: emb_shape, dtype: float64

In [11]:
df_train["emb_shape"].median()

53.0

In [6]:
len(df_train["embedding"].iloc[0])

768

In [7]:
df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)

  df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)
  df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)
  df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)
  df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)
  df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)
  df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)
  df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)
  df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)
  df_train[[f"f{i}" for i in range(768)]] = pd.DataFrame(df_train.embedding.tolist(), index=df_train.index)
  df_train[[f"f{i}" for i in

In [8]:
def process_tags(tags): 
    tags = list(map(int, tags.split(",")))
    one_hot_tags = np.zeros(N_TAGS, dtype=np.uint8)
    one_hot_tags[tags] = 1
    return one_hot_tags.tolist()


df_train[[f"tag{i}" for i in range(N_TAGS)]] = pd.DataFrame(df_train["tags"].parallel_apply(process_tags).tolist(), index=df_train.index)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12784), Label(value='0 / 12784')))…

  df_train[[f"tag{i}" for i in range(N_TAGS)]] = pd.DataFrame(df_train["tags"].parallel_apply(process_tags).tolist(), index=df_train.index)
  df_train[[f"tag{i}" for i in range(N_TAGS)]] = pd.DataFrame(df_train["tags"].parallel_apply(process_tags).tolist(), index=df_train.index)
  df_train[[f"tag{i}" for i in range(N_TAGS)]] = pd.DataFrame(df_train["tags"].parallel_apply(process_tags).tolist(), index=df_train.index)
  df_train[[f"tag{i}" for i in range(N_TAGS)]] = pd.DataFrame(df_train["tags"].parallel_apply(process_tags).tolist(), index=df_train.index)
  df_train[[f"tag{i}" for i in range(N_TAGS)]] = pd.DataFrame(df_train["tags"].parallel_apply(process_tags).tolist(), index=df_train.index)
  df_train[[f"tag{i}" for i in range(N_TAGS)]] = pd.DataFrame(df_train["tags"].parallel_apply(process_tags).tolist(), index=df_train.index)
  df_train[[f"tag{i}" for i in range(N_TAGS)]] = pd.DataFrame(df_train["tags"].parallel_apply(process_tags).tolist(), index=df_train.index)
  df_train[[f"tag{i}

In [9]:
# df_train = df_train.drop(["tags", "embedding", "track"], axis=1)

In [10]:
df_train.head()

Unnamed: 0,track,tags,embedding,f0,f1,f2,f3,f4,f5,f6,...,tag246,tag247,tag248,tag249,tag250,tag251,tag252,tag253,tag254,tag255
0,49734,56926325596,"[0.007893690839409828, 0.15074634552001953, 0....",0.007894,0.150746,0.268556,-0.123826,0.41529,-0.180336,0.229614,...,0,0,0,0,0,0,0,0,0,0
1,67845,692839145155,"[0.12421309947967529, 0.18337762355804443, 0.2...",0.124213,0.183378,0.286596,-0.100277,0.409706,-0.236483,0.209988,...,0,0,0,0,0,0,0,0,0,0
2,25302,62840116168,"[0.17797555029392242, 0.07382377982139587, 0.2...",0.177976,0.073824,0.228476,-0.062496,0.303353,-0.166569,0.223461,...,0,0,0,0,0,0,0,0,0,0
3,57796,28186,"[0.17672187089920044, 0.16318991780281067, 0.2...",0.176722,0.16319,0.204621,-0.174427,0.484905,-0.092025,0.21622,...,0,0,0,0,0,0,0,0,0,0
4,13676,623177,"[0.04586225003004074, 0.1584373563528061, 0.16...",0.045862,0.158437,0.168475,-0.011589,0.444241,-0.297196,0.082658,...,0,0,0,0,0,0,0,0,0,0


# Light Auto ML

In [26]:
task = Task(
    name='multilabel',
    metric=lambda y_true, y_pred: average_precision_score(y_true, y_pred)
)
automl = TabularAutoML(
    task, 
    memory_limit=32,
    timeout=60 * 2,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': SEED},
    general_params = {'use_algos': 'auto'},
)

[14:52:25] CatBoost uses as obj. MultiCrossEntropy.


In [35]:
train_data, val_data = train_test_split(
    df_train,
    test_size=0.98,
    random_state=SEED
)

In [33]:
oof_pred = automl.fit_predict(
    train_data,
    roles={'target': [f"tag{i}" for i in range(N_TAGS)], 'drop': ["tags", "embedding", "track"]},
    verbose=4
)

[14:57:21] Stdout logging level is DEBUG.
[14:57:21] Task: multilabel

[14:57:21] Start automl preset with listed constraints:
[14:57:21] - time: 120.00 seconds
[14:57:21] - CPU: 4 cores
[14:57:21] - memory: 32 GB

[14:57:21] [1mTrain data shape: (256, 1024)[0m



  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - target) * np.log(1 - candidates)).mean(axis=0)
  scores = -(target * np.log(candidates) + (1 - ta

[14:57:59] Feats was rejected during automatic roles guess: []
[14:58:00] Layer [1m1[0m train process start. Time left 80.90 secs
[14:58:01] Start fitting [1mLvl_0_Pipe_0_Mod_0_RFSklearn[0m ...
[14:58:01] Training params: {'bootstrap': True, 'ccp_alpha': 0.0, 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_samples_leaf': 64, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': 4, 'oob_score': False, 'random_state': 42, 'warm_start': False, 'criterion': 'gini'}
[14:58:01] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_RFSklearn[0m =====
[14:58:08] Model Lvl_0_Pipe_0_Mod_0_RFSklearn failed during ml_algo.fit_predict call.

could not broadcast input array from shape (52,2) into shape (52,)
[14:58:08] Traceback (most recent call last):
  File "/home/and/miniforge3/envs/ya/lib/python3.10/site-packages/lightautoml/ml_algo/utils.py", line 72, in tune_and_fit_predict
    preds = ml_algo.fit

  val_pred = np.moveaxis(np.array(val_pred)[:, :, 1], 1, 0)
  val_pred = np.moveaxis(np.array(val_pred)[:, :, 1], 1, 0)
Trial 0 failed with parameters: {'min_samples_leaf': 96, 'max_depth': 10} because of the following error: ValueError('could not broadcast input array from shape (52,2) into shape (52,)').
Traceback (most recent call last):
  File "/home/and/miniforge3/envs/ya/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/home/and/miniforge3/envs/ya/lib/python3.10/site-packages/lightautoml/ml_algo/tuning/optuna.py", line 254, in objective
    output_dataset = _ml_algo.fit_predict(train_valid_iterator=train_valid_iterator)
  File "/home/and/miniforge3/envs/ya/lib/python3.10/site-packages/lightautoml/ml_algo/base.py", line 273, in fit_predict
    model, pred = self.fit_predict_single_fold(train, valid)
  File "/home/and/miniforge3/envs/ya/lib/python3.10/site-packages/lightautoml/ml_algo/random_forest.py", line 

[14:58:15] Model Lvl_0_Pipe_0_Mod_1_Tuned_RFSklearn failed during params_tuner.fit call.

could not broadcast input array from shape (52,2) into shape (52,)


AssertionError: Pipeline finished with 0 models for some reason.
Probably one or more models failed

In [None]:
test_pred = automl.predict(df_test)

pd.DataFrame({
    'PassengerId':df_test.PassengerId,
    'Survived': (test_pred.data[:, 0] > 0.5)*1
}).to_csv('submit.csv', index = False)
