In [None]:
# %load_ext blackcellmagic

![](https://idao.world/wp-content/uploads/2018/10/16-%D0%BD%D0%B0-9-1600%D1%85900-768x432.png)

This is the Jupyter notebook used by team **Fantastic21** for model training and inference in the **International Data Analysis Olympiad 2019** (https://idao.world/) competition.

Team: **Fantastic21**  
Members:
1. **Phung Cheng Shyong** (chengshyongphung@hotmail.com)
2. **Kok Gin Xian** (xian_kgx@hotmail.com)
3. **Alvin Ting Kee Ngoh**

In [None]:
# Installation of additional packages for running in Google Cloud Platform (GCP).
# These are the third-party packages imported below (imblearn, joblib,
# category_encoders, lightgbm).
# NOTE(review): versions are not pinned, so a fresh environment may resolve to
# different releases than the ones this notebook was developed against.
!pip install imblearn joblib category_encoders lightgbm

Imports

In [None]:
import gc
import math
import os

import numpy as np
import pandas as pd

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from joblib import dump, load

# from sklearn.imputer import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
import category_encoders

import lightgbm as lgb

Setting of environment variables for working with GCP.

In [None]:
# GCP identifiers, also exported to the environment of the kernel process.
PROJECT = "IDAO"
BUCKET = "team469"

os.environ.update({"PROJECT": PROJECT, "BUCKET": BUCKET})

Data loading

In [None]:
# Read the two training parts, the public test set and the sample submission
# from the local data/ directory, in that order.
df_train_part_1, df_train_part_2, df_test, df_submission = map(
    pd.read_csv,
    (
        "data/train_part_1_v2.csv",
        "data/train_part_2_v2.csv",
        "data/test_public_v2.csv",
        "data/sample_submission.csv",
    ),
)

In [None]:
# Make the "id" column the row index of every loaded table (modified in place).
for _frame in (df_train_part_1, df_train_part_2, df_test, df_submission):
    _frame.set_index("id", inplace=True)
del _frame

In [None]:
# Stack train part 1 and train part 2 into a single dataset with a fresh
# 0..n-1 row index (the "id" index of the parts is discarded).
df_train = pd.concat([df_train_part_1, df_train_part_2], ignore_index=True)

# The separate parts are not needed under their own names any more.
del df_train_part_1
del df_train_part_2

In [None]:
# Display the first rows of the combined training frame.
df_train.head()

Categorical features

In [None]:
# Names of the test-set columns whose dtype is "object", in column order.
categorical_features = [
    column for column, dtype in df_test.dtypes.items() if dtype == "object"
]
categorical_features

Excluded features

In [None]:
# The prediction target plus the columns that, per the notes below, exist only in
# the training data.
# NOTE(review): presumably these are dropped before fitting; the code that
# consumes this list is outside this part of the notebook.
excluded_features = [
    "label",  # This is what we need to predict
    "particle_type",  # Not in test set
    "weight",  # Not in test set
    "sWeight",  # Not in test set
    "kinWeight",  # Not in test set
]

Custom scikit-learn transformers and utility functions

In [None]:
class RadiusFromCoordinates(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer for calculating radius from coordinate axes.

    Appends one column holding sqrt(sum(axis ** 2)) over the selected
    coordinate columns of each row.

    Parameters
    ----------
    feature_name : str
        The name of the new column/feature.
    axis_cols : list of str
        The coordinate axes (column names) for computation of radius. With an
        empty list the new column is all zeros.
    """

    def __init__(
        self,
        # The name of the new column/feature
        feature_name,
        # The coordinate axes for computation of radius
        axis_cols=[],
    ):
        self.feature_name = feature_name
        # The default list is shared between instances; it is only read here.
        self.axis_cols = axis_cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a new dataframe: X plus the radius column."""
        # Accumulate on X's own index so the new column is row-aligned with X in
        # the concat below for any index, including when axis_cols is empty.
        squared_sum = pd.Series(np.zeros(len(X)), index=X.index)

        for axis in self.axis_cols:
            squared_sum = squared_sum + X[axis] ** 2

        radius = pd.Series(squared_sum ** 0.5, name=self.feature_name)

        return pd.concat([X, radius], axis=1)

In [None]:
class AggregatedFeaturesFromArrayFeatures(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer for computing statistics (min, max, mean, median, size, sum, and variance)
    from array features.

    Each cell of an array column is parsed as a string of whitespace-separated
    numbers, with any "[" and "]" characters removed first (e.g. "[1 2 3]").
    For every column in ``array_cols`` seven columns named ``<col>__<stat>``
    are appended, in the order min, max, mean, median, size, sum, var.

    An empty array ("[]") yields NaN for min/max/mean/median/var instead of
    raising; size and sum are still computed for it.

    Parameters
    ----------
    array_cols : list of str
        Names of the string columns to aggregate.
    """

    # (column suffix, reducer) pairs, in output column order.
    _STATISTICS = (
        ("min", np.min),
        ("max", np.max),
        ("mean", np.mean),
        ("median", np.median),
        ("size", np.size),
        ("sum", np.sum),
        ("var", np.var),
    )
    # Statistics that are evaluated even for an empty array.
    _DEFINED_ON_EMPTY = ("size", "sum")

    def __init__(self, array_cols=[]):
        # The default list is shared between instances; it is only read here.
        self.array_cols = array_cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a new dataframe: X plus the ``<col>__<stat>`` columns."""
        new_cols = []

        for col in self.array_cols:
            # regex=False: "[" and "]" are literal brackets, not regex syntax.
            tokens = (
                X[col]
                .str.replace("[", "", regex=False)
                .str.replace("]", "", regex=False)
                .str.split()
            )
            arrays = [pd.to_numeric(item) for item in tokens]

            for suffix, reducer in self._STATISTICS:
                values = [
                    np.nan
                    if np.size(arr) == 0 and suffix not in self._DEFINED_ON_EMPTY
                    else reducer(arr)
                    for arr in arrays
                ]
                new_cols.append(
                    pd.Series(values, name="{}__{}".format(col, suffix), index=X.index)
                )

        return pd.concat([X] + new_cols, axis=1)

In [None]:
def enrich_df_with_foi_station_hit_count(df):
    """
    Add FOI hit-count features derived from the "FOI_hits_S" string column.

    Columns are added to ``df`` in place, in this order:
      * FOI_hit_count_S[k]         - occurrences of the digit k (0..3) in FOI_hits_S
      * FOI_hit_count_sum          - sum of the four counts
      * FOI_hit_count_ratio_S[k]   - count k divided by the sum
      * FOI_hit_count_ratio_S[a][b] - ratio a divided by ratio b for the six
        station pairs with a > b

    Returns the same dataframe object.
    """
    station_ids = range(4)
    raw_hits = df["FOI_hits_S"]

    # Per-station counts.
    for station in station_ids:
        df["FOI_hit_count_S[{}]".format(station)] = raw_hits.str.count(str(station))

    # Total over the four stations, accumulated left to right.
    total = df["FOI_hit_count_S[0]"]
    for station in (1, 2, 3):
        total = total + df["FOI_hit_count_S[{}]".format(station)]
    df["FOI_hit_count_sum"] = total

    # Share of each station in the total.
    for station in station_ids:
        df["FOI_hit_count_ratio_S[{}]".format(station)] = (
            df["FOI_hit_count_S[{}]".format(station)] / df["FOI_hit_count_sum"]
        )

    # Share of a later station relative to an earlier one.
    for upper, lower in [(1, 0), (2, 0), (3, 0), (2, 1), (3, 1), (3, 2)]:
        df["FOI_hit_count_ratio_S[{}][{}]".format(upper, lower)] = (
            df["FOI_hit_count_ratio_S[{}]".format(upper)]
            / df["FOI_hit_count_ratio_S[{}]".format(lower)]
        )

    return df

In [None]:
def enrich_df_with_Lextra_Matched_Hit_Vector(df):
    """
    Add the X and Y components of the MatchedHit-to-Lextra vector per station.

    For each axis (X, then Y) and each station 0..3 the column
    ``Lextra_Matched_Hit_Vector_<axis>[<station>]`` is set to
    ``Lextra_<axis>[<station>] - MatchedHit_<axis>[<station>]``.
    ``df`` is modified in place and returned.
    """
    for axis in ("X", "Y"):
        for station in range(4):
            lextra = df["Lextra_{}[{}]".format(axis, station)]
            matched = df["MatchedHit_{}[{}]".format(axis, station)]
            df["Lextra_Matched_Hit_Vector_{}[{}]".format(axis, station)] = (
                lextra - matched
            )

    return df

In [None]:
def two_vector_angle_and_magnitudes(
    # Vector 1
    x_dst_1,
    y_dst_1,
    z_dst_1,
    x_src_1,
    y_src_1,
    z_src_1,
    # Vector 2
    x_dst_2,
    y_dst_2,
    z_dst_2,
    x_src_2,
    y_src_2,
    z_src_2,
):
    """
    Function for computing the angle between two vectors and their magnitudes.

    Each vector is given by its destination and source coordinates
    (vector = dst - src); all arguments are equally long arrays and the
    computation is element-wise (one vector pair per position).

    Returns
    -------
    angle : angle in radians, in [0, pi]; NaN where either vector has zero
        length (0 / 0).
    v_1_magnitude, v_2_magnitude : Euclidean lengths of the two vectors.
    """

    dx_1 = x_dst_1 - x_src_1
    dy_1 = y_dst_1 - y_src_1
    dz_1 = z_dst_1 - z_src_1

    dx_2 = x_dst_2 - x_src_2
    dy_2 = y_dst_2 - y_src_2
    dz_2 = z_dst_2 - z_src_2

    # Element-wise dot product, computed in one vectorised expression.
    v_1_dot_v_2 = dx_1 * dx_2 + dy_1 * dy_2 + dz_1 * dz_2

    v_1_magnitude = (dx_1 ** 2 + dy_1 ** 2 + dz_1 ** 2) ** 0.5
    v_2_magnitude = (dx_2 ** 2 + dy_2 ** 2 + dz_2 ** 2) ** 0.5

    # Rounding can push the cosine of (anti)parallel vectors marginally outside
    # [-1, 1], which would make arccos return NaN; clip it back. A genuine
    # NaN from a zero-length vector passes through the clip unchanged.
    cosine = v_1_dot_v_2 / (v_1_magnitude * v_2_magnitude)
    angle = np.arccos(np.clip(cosine, -1.0, 1.0))

    return angle, v_1_magnitude, v_2_magnitude

In [None]:
class FeaturesDropper(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style transformer that removes a fixed set of columns.

    ``columns`` holds the column labels to remove; ``transform`` returns the
    result of dropping them from X.
    """

    def __init__(self, columns=[]):
        self.columns = columns

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        remaining = X.drop(columns=self.columns)
        return remaining

In [None]:
class LeftJoin(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer for performing a left join between two dataframes.

    The right-hand table is read from ``csv_path`` on every ``transform`` call
    and merged onto X with ``how="left"``.

    Parameters
    ----------
    csv_path : path of the CSV file holding the right-hand table.
    on : key column(s) named identically in X and in the CSV.
    left_on, right_on : key column(s) in X and in the CSV respectively. They
        must be given together and cannot be combined with ``on``.

    Raises
    ------
    ValueError
        If the key arguments are missing or combined inconsistently.
    """

    def __init__(self, csv_path, on=None, left_on=None, right_on=None):
        self.csv_path = csv_path

        if on and (left_on or right_on):
            raise ValueError("Both on and left/right on set!")
        if (left_on and not right_on) or (not left_on and right_on):
            raise ValueError("Only left or right on set!")
        if not on and not left_on and not right_on:
            raise ValueError("Neither on nor left_on and right_on set!")

        self.on = on
        self.left_on = left_on
        self.right_on = right_on

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Return X left-joined with the table stored at ``csv_path``."""
        X_right = pd.read_csv(self.csv_path)

        # Use whichever key specification was supplied; __init__ guarantees
        # that exactly one of the two forms is set.
        if self.on:
            return X.merge(X_right, how="left", on=self.on)

        return X.merge(
            X_right, how="left", left_on=self.left_on, right_on=self.right_on
        )

In [None]:
class ColumnRenamer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style transformer that gives one dataframe column a new name.

    ``col`` is the current column label and ``col_new_name`` the label it is
    renamed to.
    """

    def __init__(self, col, col_new_name):
        self.col = col
        self.col_new_name = col_new_name

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        mapping = {self.col: self.col_new_name}
        return X.rename(columns=mapping)

In [None]:
class DivideTwoFeatures(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style transformer that appends the quotient of two columns.

    The new column ``new_col_name`` holds ``X[col1] / X[col2]``.
    """

    def __init__(self, col1, col2, new_col_name):
        self.col1 = col1
        self.col2 = col2
        self.new_col_name = new_col_name

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        numerator = X[self.col1]
        denominator = X[self.col2]
        quotient = pd.Series(numerator / denominator, name=self.new_col_name)
        return pd.concat([X, quotient], axis=1)

In [None]:
class MultiplyFeatures(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer for creating a new column/feature that is the multiplication of a list of
    columns/features in a dataframe.

    Parameters
    ----------
    new_col_name : str
        Name of the appended product column.
    columns : list of str
        Columns to multiply together. With an empty list the new column is
        all ones.
    """

    def __init__(self, new_col_name, columns):
        self.new_col_name = new_col_name
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a new dataframe: X plus the product column."""
        # Accumulate on X's own index so the new column is row-aligned with X in
        # the concat below for any index, including when columns is empty.
        product = pd.Series(np.ones(len(X)), index=X.index)

        for col in self.columns:
            product = product * X[col]

        product = pd.Series(product, name=self.new_col_name)

        return pd.concat([X, product], axis=1)

In [None]:
class MultiColumnSum(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer for creating a new column/feature that is the sum of a list of columns/features
    in a dataframe.

    Parameters
    ----------
    new_col_name : str
        Name of the appended sum column.
    columns : list of str
        Columns to add together. With an empty list the new column is all
        zeros.
    """

    def __init__(self, new_col_name, columns):
        self.columns = columns
        self.new_col_name = new_col_name

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a new dataframe: X plus the sum column."""
        # Accumulate on X's own index so the new column is row-aligned with X in
        # the concat below for any index, including when columns is empty.
        total = pd.Series(np.zeros(len(X)), index=X.index)

        for col in self.columns:
            total = total + X[col]

        total = pd.Series(total, name=self.new_col_name)
        return pd.concat([X, total], axis=1)

In [None]:
# Angle and magnitude of coordinates
def enrich_df_with_matchhit_angle_and_radius_at_station_3D(df):
    """
    Add, per station, the angles between the MatchedHit position vector and its
    own Z, Y and X components, plus the length of the position vector.

    Columns are added to ``df`` in place, in this order:
      * per station 0..3: MatchedHit_vector_angle_Z[i], MatchHit_R_from_origin_S[i],
        MatchedHit_vector_angle_Y[i], MatchedHit_vector_angle_X[i]
      * MatchedHit_vector_angle_{Z,Y,X}_sum and MatchHit_R_from_origin_S_sum
      * per station: MatchedHit_vector_angle_{Z,Y,X}_ratio[i] (angle / sum)
      * per station pair (s1 < s2) and axis Z, Y, X: the difference, the ratio
        and the 0/1 "greater than" flag of angle[s2] versus angle[s1]

    Returns the same dataframe object.
    """
    axis_order = ("Z", "Y", "X")
    station_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

    def angle_name(axis, station):
        return "MatchedHit_vector_angle_{}[{}]".format(axis, station)

    def total_over_stations(template):
        # Left-to-right accumulation over stations 0..3.
        running = df[template.format(0)]
        for station in (1, 2, 3):
            running = running + df[template.format(station)]
        return running

    for station in range(4):
        hit = {
            name: df["MatchedHit_{}[{}]".format(name, station)].values
            for name in "XYZ"
        }
        zero = {
            name: np.zeros_like(df["MatchedHit_{}[{}]".format(name, station)])
            for name in "XYZ"
        }

        for axis in axis_order:
            # Second vector: the hit's component along `axis`, zero elsewhere.
            ref = {name: hit[name] if name == axis else zero[name] for name in "XYZ"}
            angle, radius, _ = two_vector_angle_and_magnitudes(
                hit["X"],
                hit["Y"],
                hit["Z"],
                zero["X"],
                zero["Y"],
                zero["Z"],
                ref["X"],
                ref["Y"],
                ref["Z"],
                zero["X"],
                zero["Y"],
                zero["Z"],
            )
            df[angle_name(axis, station)] = angle
            if axis == "Z":
                # The length of the position vector is stored once, with the Z angle.
                df["MatchHit_R_from_origin_S[{}]".format(station)] = radius

    for axis in axis_order:
        df["MatchedHit_vector_angle_{}_sum".format(axis)] = total_over_stations(
            "MatchedHit_vector_angle_" + axis + "[{}]"
        )
    df["MatchHit_R_from_origin_S_sum"] = total_over_stations(
        "MatchHit_R_from_origin_S[{}]"
    )

    for station in range(4):
        for axis in axis_order:
            df["MatchedHit_vector_angle_{}_ratio[{}]".format(axis, station)] = (
                df[angle_name(axis, station)]
                / df["MatchedHit_vector_angle_{}_sum".format(axis)]
            )

    for s1, s2 in station_pairs:
        for axis in axis_order:
            later = df[angle_name(axis, s2)]
            earlier = df[angle_name(axis, s1)]
            df["MatchedHit_vector_angle_{}_diff_[{},{}]".format(axis, s2, s1)] = (
                later - earlier
            )
            df["MatchedHit_vector_angle_{}_ratio_[{},{}]".format(axis, s2, s1)] = (
                later / earlier
            )
            df["MatchedHit_vector_angle_{}[{}>{}]".format(axis, s2, s1)] = (
                later > earlier
            ).astype("int")

    return df

In [None]:
# Angle and magnitude of coordinates between two stations
def enrich_df_with_matchhit_angle_and_radius_between_two_stations(
    df, src_station, dst_station
):
    """
    Add the angles between the MatchedHit displacement from ``src_station`` to
    ``dst_station`` and that displacement's own Z, Y and X components, plus the
    displacement length.

    Columns added to ``df`` in place, in this order (suffix ``[src,dst]``):
    MatchHit_angle_between_stations_Z_, MatchHit_R_between_stations_,
    MatchHit_angle_between_stations_Y_, MatchHit_angle_between_stations_X_.

    Returns the same dataframe object.
    """

    def hit(axis, station):
        return df["MatchedHit_{}[{}]".format(axis, station)].values

    def zeros(axis, station):
        return np.zeros_like(df["MatchedHit_{}[{}]".format(axis, station)])

    src, dst = src_station, dst_station
    suffix = "[{},{}]".format(src, dst)

    # First vector for all three calls: hit position at dst minus hit position at src.
    displacement = (
        hit("X", dst),
        hit("Y", dst),
        hit("Z", dst),
        hit("X", src),
        hit("Y", src),
        hit("Z", src),
    )

    angle_z, radius, _ = two_vector_angle_and_magnitudes(
        *displacement,
        zeros("X", dst),
        zeros("Y", dst),
        hit("Z", dst),
        zeros("X", src),
        zeros("Y", src),
        hit("Z", src),
    )
    df["MatchHit_angle_between_stations_Z_" + suffix] = angle_z
    df["MatchHit_R_between_stations_" + suffix] = radius

    # Some source-side zero placeholders below are built from dst columns, exactly
    # as in the previous revision of this function; they are all-zero either way.
    angle_y, _, _ = two_vector_angle_and_magnitudes(
        *displacement,
        zeros("X", dst),
        hit("Y", dst),
        zeros("Z", dst),
        zeros("X", src),
        hit("Y", src),
        zeros("Z", dst),
    )
    df["MatchHit_angle_between_stations_Y_" + suffix] = angle_y

    angle_x, _, _ = two_vector_angle_and_magnitudes(
        *displacement,
        hit("X", dst),
        zeros("Y", dst),
        zeros("Z", dst),
        hit("X", src),
        zeros("Y", dst),
        zeros("Z", dst),
    )
    df["MatchHit_angle_between_stations_X_" + suffix] = angle_x

    return df

In [None]:
# Polar coordinates conversion
def enrich_df_with_matchedhit_radial_features(df):
    """
    Add polar-angle features for the MatchedHit and Lextra coordinates.

    For each plane (Y,X), (Y,Z), (X,Z) and each prefix, columns are added to
    ``df`` in place:
      * <prefix>_angle_<plane>[i]        - arctan2(a1, a2) at station i
      * <prefix>_angle_sum_<plane>       - sum of the four station angles
      * <prefix>_angle_ratio_<plane>[i]  - station angle / sum
      * <prefix>_angle_diff_<plane>[s2,s1], <prefix>_angle_ratio_<plane>[s2,s1],
        <prefix>_angle_<plane>[s2>s1]    - pairwise difference, ratio and 0/1 flag

    MatchedHit columns are required. Lextra features are best effort: a step is
    skipped when a column it needs is absent from ``df`` (KeyError). Any other
    error is raised instead of being silently ignored.

    Returns the same dataframe object.
    """
    station_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

    def angle_col(prefix, plane, station):
        return "{}_angle_{}[{}]".format(prefix, plane, station)

    def add_station_angle(prefix, a1, a2, station):
        df[angle_col(prefix, a1 + a2, station)] = np.arctan2(
            df["{}_{}[{}]".format(prefix, a1, station)],
            df["{}_{}[{}]".format(prefix, a2, station)],
        )

    def add_angle_sum(prefix, plane):
        df["{}_angle_sum_{}".format(prefix, plane)] = (
            df[angle_col(prefix, plane, 0)]
            + df[angle_col(prefix, plane, 1)]
            + df[angle_col(prefix, plane, 2)]
            + df[angle_col(prefix, plane, 3)]
        )

    def add_angle_share(prefix, plane, station):
        df["{}_angle_ratio_{}[{}]".format(prefix, plane, station)] = (
            df[angle_col(prefix, plane, station)]
            / df["{}_angle_sum_{}".format(prefix, plane)]
        )

    def add_pair_features(prefix, plane, s1, s2):
        # Both inputs are looked up before anything is written, so a missing
        # column leaves none of the three pair columns behind.
        later = df[angle_col(prefix, plane, s2)]
        earlier = df[angle_col(prefix, plane, s1)]
        df["{}_angle_diff_{}[{},{}]".format(prefix, plane, s2, s1)] = later - earlier
        df["{}_angle_ratio_{}[{},{}]".format(prefix, plane, s2, s1)] = later / earlier
        df["{}_angle_{}[{}>{}]".format(prefix, plane, s2, s1)] = (
            later > earlier
        ).astype("int")

    def lextra_best_effort(step, *args):
        """Run one Lextra step, skipping it when a needed column is absent."""
        try:
            step("Lextra", *args)
        except KeyError:
            # Only a missing column is tolerated; other errors propagate.
            pass

    for a1, a2 in [("Y", "X"), ("Y", "Z"), ("X", "Z")]:
        plane = a1 + a2

        for i in range(4):
            add_station_angle("MatchedHit", a1, a2, i)
            lextra_best_effort(add_station_angle, a1, a2, i)

        add_angle_sum("MatchedHit", plane)
        lextra_best_effort(add_angle_sum, plane)

        for i in range(4):
            add_angle_share("MatchedHit", plane, i)
            lextra_best_effort(add_angle_share, plane, i)

        for s1, s2 in station_pairs:
            add_pair_features("MatchedHit", plane, s1, s2)
            lextra_best_effort(add_pair_features, plane, s1, s2)

    return df

In [None]:
# Polar coordinates conversion of the Lextra Matched Hit Vector
def enrich_df_with_Lextra_matchedhit_radial_features(df):
    """
    Add polar-angle features of the Lextra_Matched_Hit_Vector in the (Y, X) plane.

    Requires the columns ``Lextra_Matched_Hit_Vector_{X,Y}[0..3]``. Columns
    added to ``df`` in place:
      * Lextra_Matched_Hit_Vector_angle_YX[i]     - arctan2(Y, X) at station i
      * Lextra_Matched_Hit_Vector_angle_sum_YX    - sum of the four station angles
      * Lextra_Matched_Hit_Vector_ratio_YX[i]     - station angle / sum
      * Lextra_Matched_Hit_Vector_angle_diff_YX[s2,s1],
        Lextra_Matched_Hit_Vector_angle_ratio_YX[s2,s1],
        Lextra_Matched_Hit_Vector_angle_YX[s2>s1] - pairwise difference, ratio
        and 0/1 flag

    Returns the same dataframe object.
    """
    for a1, a2 in [("Y", "X")]:
        for i in range(4):
            df[
                "Lextra_Matched_Hit_Vector_angle_{}{}[{}]".format(a1, a2, i)
            ] = np.arctan2(
                df["Lextra_Matched_Hit_Vector_{}[{}]".format(a1, i)],
                df["Lextra_Matched_Hit_Vector_{}[{}]".format(a2, i)],
            )

        # All four terms are this vector's own station angles. Stations 2 and 3
        # used to read the MatchedHit_angle_* columns of another feature family.
        df["Lextra_Matched_Hit_Vector_angle_sum_{}{}".format(a1, a2)] = (
            df["Lextra_Matched_Hit_Vector_angle_{}{}[0]".format(a1, a2)]
            + df["Lextra_Matched_Hit_Vector_angle_{}{}[1]".format(a1, a2)]
            + df["Lextra_Matched_Hit_Vector_angle_{}{}[2]".format(a1, a2)]
            + df["Lextra_Matched_Hit_Vector_angle_{}{}[3]".format(a1, a2)]
        )

        for i in range(4):
            df["Lextra_Matched_Hit_Vector_ratio_{}{}[{}]".format(a1, a2, i)] = (
                df["Lextra_Matched_Hit_Vector_angle_{}{}[{}]".format(a1, a2, i)]
                / df["Lextra_Matched_Hit_Vector_angle_sum_{}{}".format(a1, a2)]
            )

        for s1, s2 in [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]:
            later = df["Lextra_Matched_Hit_Vector_angle_{}{}[{}]".format(a1, a2, s2)]
            earlier = df["Lextra_Matched_Hit_Vector_angle_{}{}[{}]".format(a1, a2, s1)]

            df[
                "Lextra_Matched_Hit_Vector_angle_diff_{}{}[{},{}]".format(
                    a1, a2, s2, s1
                )
            ] = (later - earlier)
            df[
                "Lextra_Matched_Hit_Vector_angle_ratio_{}{}[{},{}]".format(
                    a1, a2, s2, s1
                )
            ] = (later / earlier)
            df["Lextra_Matched_Hit_Vector_angle_{}{}[{}>{}]".format(a1, a2, s2, s1)] = (
                later > earlier
            ).astype("int")

    return df

In [None]:
# Deriving information from time feature
def enrich_df_with_matchedhit_time_features(df):
    """
    Add features derived from the per-station MatchedHit_T[0..3] columns.

    Columns added to ``df`` in place, in this order:
      * MatchedHit_T[sum]         - sum of the four station times
      * MatchedHit_T_ratio[i]     - station time / sum
      * per station pair (s1 < s2): MatchedHit_T_diff[s2,s1] = T[s2] - T[s1]
        and MatchedHit_T_ratio[s2,s1] = T[s1] / T[s2]

    Returns the same dataframe object.
    """

    def time_col(station):
        return "MatchedHit_T[{}]".format(station)

    # Left-to-right accumulation over stations 0..3.
    total = df[time_col(0)]
    for station in (1, 2, 3):
        total = total + df[time_col(station)]
    df["MatchedHit_T[sum]"] = total

    for station in range(4):
        df["MatchedHit_T_ratio[{}]".format(station)] = (
            df[time_col(station)] / df["MatchedHit_T[sum]"]
        )

    for s1, s2 in [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]:
        later = df[time_col(s2)]
        earlier = df[time_col(s1)]
        df["MatchedHit_T_diff[{},{}]".format(s2, s1)] = later - earlier
        # NOTE(review): this ratio is T[s1] / T[s2], i.e. the opposite orientation
        # to the [s2,s1] ratio columns built by the other enrich_* helpers. Kept
        # as is; confirm whether that is intended.
        df["MatchedHit_T_ratio[{},{}]".format(s2, s1)] = earlier / later

    return df

In [None]:
def _derived_feature_steps():
    """
    Build the ordered list of transformers that ``prepare_df`` runs as a pipeline.

    The steps are generated instead of being written out one by one, but they
    are emitted in the same order and with the same arguments as the previous
    hand-written list. Order matters: a ``[sum]`` column must exist before the
    ratios that divide by it, and a radius column before its pairwise ratios.
    """
    stations = range(4)
    station_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

    def column_sum(prefix):
        # "<prefix>[sum]" built from "<prefix>[0]" .. "<prefix>[3]".
        return [
            MultiColumnSum(
                "{}[sum]".format(prefix),
                ["{}[{}]".format(prefix, i) for i in stations],
            )
        ]

    def share_of_sum(prefix):
        # "<prefix>_ratio[i]" from "<prefix>[i]" and "<prefix>[sum]".
        return [
            DivideTwoFeatures(
                "{}[{}]".format(prefix, i),
                "{}[sum]".format(prefix),
                "{}_ratio[{}]".format(prefix, i),
            )
            for i in stations
        ]

    def pair_ratios(prefix, later_first=False):
        # "<prefix>_ratio[p,q]" from "<prefix>[p]" and "<prefix>[q]" for every
        # station pair; with later_first=True the higher station comes first.
        steps = []
        for s1, s2 in station_pairs:
            first, second = (s2, s1) if later_first else (s1, s2)
            steps.append(
                DivideTwoFeatures(
                    "{}[{}]".format(prefix, first),
                    "{}[{}]".format(prefix, second),
                    "{}_ratio[{},{}]".format(prefix, first, second),
                )
            )
        return steps

    def radii(target, coordinate_prefixes):
        # "<target>[i]" from the listed coordinate columns of station i.
        return [
            RadiusFromCoordinates(
                "{}[{}]".format(target, i),
                ["{}[{}]".format(prefix, i) for prefix in coordinate_prefixes],
            )
            for i in stations
        ]

    steps = []

    steps += column_sum("ncl")
    steps += share_of_sum("ncl")

    # Coordinate (or time) plus its uncertainty column, per station.
    for axis in "XYZT":
        steps += [
            MultiColumnSum(
                "MatchedHit_{0}+D{0}[{1}]".format(axis, i),
                [
                    "MatchedHit_{0}[{1}]".format(axis, i),
                    "MatchedHit_D{0}[{1}]".format(axis, i),
                ],
            )
            for i in stations
        ]

    for axis in "XYZ":
        steps += pair_ratios("MatchedHit_{}".format(axis))

    steps += [
        MultiplyFeatures(
            "count[{}]".format(i),
            ["ncl[{}]".format(i), "avg_cs[{}]".format(i)],
        )
        for i in stations
    ]
    for prefix in ("count", "avg_cs"):
        steps += column_sum(prefix)
        steps += share_of_sum(prefix)
        steps += pair_ratios(prefix, later_first=True)

    steps += radii("Lextra_R", ["Lextra_X", "Lextra_Y"])
    steps += pair_ratios("Lextra_R")

    steps += radii(
        "Lextra_Matched_Hit_Vector_R",
        ["Lextra_Matched_Hit_Vector_X", "Lextra_Matched_Hit_Vector_Y"],
    )
    steps += pair_ratios("Lextra_Matched_Hit_Vector_R")

    for axis in "XY":
        steps += [
            DivideTwoFeatures(
                "Lextra_{}[{}]".format(axis, i),
                "MatchedHit_{}[{}]".format(axis, i),
                # "MachedHit" (sic): the existing feature name is kept as is.
                "Lextra_to_MachedHit_ratio_{}[{}]".format(axis, i),
            )
            for i in stations
        ]

    steps += radii("MatchedHit_R", ["MatchedHit_X", "MatchedHit_Y", "MatchedHit_Z"])
    steps += pair_ratios("MatchedHit_R")

    steps += radii("MatchedHit_R1", ["MatchedHit_X", "MatchedHit_Y"])
    steps += pair_ratios("MatchedHit_R1")

    # The Lextra_X / Lextra_Y ratios of stations 2 and 3 used to be labelled
    # "[3,3]"; generating the names gives them the correct "[2,3]" label.
    for prefix in (
        "MatchedHit_T",
        "Lextra_X",
        "Lextra_Y",
        "Lextra_Matched_Hit_Vector_X",
        "Lextra_Matched_Hit_Vector_Y",
    ):
        steps += pair_ratios(prefix)

    return steps


def prepare_df(df):
    """
    Function for preparing dataframe prior to model training/inference.

    Processing order:

    1. ``AggregatedFeaturesFromArrayFeatures(categorical_features)`` is fitted
       and applied.
    2. The ``enrich_df_with_*`` helpers add their columns.
    3. A pipeline of sum / product / ratio / radius transformers adds further
       derived columns (see ``_derived_feature_steps``).
    4. ``ndof`` and ``MatchedHit_TYPE[0..3]`` are one-hot encoded.
    5. The ``FOI_hits_*`` columns are dropped.
    6. ``[``, ``]``, ``<`` and ``>`` are replaced in the column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw train or test dataframe.

    Returns
    -------
    pandas.DataFrame
        The transformed dataframe with renamed columns.
    """
    df = AggregatedFeaturesFromArrayFeatures(categorical_features).fit_transform(df)

    # The return values are ignored: these helpers are relied upon to add
    # their columns to ``df`` in place.
    enrich_df_with_foi_station_hit_count(df)
    enrich_df_with_matchedhit_radial_features(df)
    enrich_df_with_matchedhit_time_features(df)
    enrich_df_with_matchhit_angle_and_radius_at_station_3D(df)
    enrich_df_with_Lextra_Matched_Hit_Vector(df)
    enrich_df_with_Lextra_matchedhit_radial_features(df)

    for s1, s2 in [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]:
        enrich_df_with_matchhit_angle_and_radius_between_two_stations(df, s1, s2)

    df = make_pipeline(*_derived_feature_steps()).fit_transform(df)

    # NOTE(review): the encoder is re-fitted on every call, so the one-hot
    # columns produced for a training frame and for a test frame each depend on
    # the categories present in that frame - confirm the two encodings line up.
    df = category_encoders.OneHotEncoder(
        cols=[
            "ndof",
            "MatchedHit_TYPE[0]",
            "MatchedHit_TYPE[1]",
            "MatchedHit_TYPE[2]",
            "MatchedHit_TYPE[3]",
        ]
    ).fit_transform(df)

    drop_features = [
        "FOI_hits_X",
        "FOI_hits_Y",
        "FOI_hits_Z",
        "FOI_hits_DX",
        "FOI_hits_DY",
        "FOI_hits_DZ",
        "FOI_hits_T",
        "FOI_hits_DT",
        "FOI_hits_S",
    ]
    df = FeaturesDropper(drop_features).fit_transform(df)

    # Replace characters in feature names that are not supported by xgboost.
    # None of the replacement strings contains a character that a later
    # replace() targets, so one chained pass equals four separate passes.
    df.columns = [
        col.replace("[", "(")
        .replace("]", ")")
        .replace("<", "_lt_")
        .replace(">", "_gt_")
        for col in df.columns
    ]

    return df

In [None]:
# Smoke test: run the whole preparation on the first five rows of the public
# test set before spending time on the full training data.
df_train_test = prepare_df(df_test[0:5])

In [None]:
# (rows, columns) of the prepared five-row sample.
df_train_test.shape

Randomly undersample the training dataset so that the label distribution is balanced.

In [None]:
# Undersample with a fixed seed, keep only the resampled feature matrix
# (first element of the returned pair) and restore the column names.
resampler = RandomUnderSampler(random_state=21)
df_train_resampled = pd.DataFrame(
    resampler.fit_resample(df_train, df_train["label"])[0],
    columns=df_train.columns,
)

In [None]:
# Convert every non-categorical column of the resampled frame to a numeric dtype.
for col in df_train_resampled.columns.tolist():
    if col in categorical_features:
        continue
    df_train_resampled[col] = pd.to_numeric(df_train_resampled[col])

Prepare train dataframe for training/inference.

In [None]:
# Full feature engineering on the balanced training sample.
df_train_resampled1 = prepare_df(df_train_resampled)

Define and train regression models to predict **weight**, **sWeight**, and **kinWeight** from other features.

In [None]:
# All three auxiliary regressors are trained on the same inputs: every
# prepared column that is not listed in excluded_features.
aux_feature_columns = df_train_resampled1.columns.difference(excluded_features)

# weight model
model1 = make_pipeline(lgb.LGBMRegressor(n_estimators=1000, random_state=21))
model1.fit(df_train_resampled1[aux_feature_columns], df_train_resampled1["weight"])

# sWeight model
model3 = make_pipeline(lgb.LGBMRegressor(n_estimators=1000, random_state=21))
model3.fit(df_train_resampled1[aux_feature_columns], df_train_resampled1["sWeight"])

# kinWeight model
model4 = make_pipeline(lgb.LGBMRegressor(n_estimators=1000, random_state=21))
model4.fit(df_train_resampled1[aux_feature_columns], df_train_resampled1["kinWeight"])

Predict **weight**, **sWeight** and **kinWeight** for the training data with the three regression models, then append the predictions to the prepared training dataframe as new features.

In [None]:
# Columns fed to the auxiliary regressors (same selection as used for fitting).
train_aux_columns = df_train_resampled1.columns.difference(excluded_features)

# Each prediction frame is given the index of df_train_resampled1.
# pd.concat(axis=1) aligns rows on the index, so a default 0..n-1 index would
# misalign (and pad with NaN) whenever the training frame is indexed otherwise.
predictions_weight = pd.DataFrame(
    model1.predict(df_train_resampled1[train_aux_columns]),
    columns=["prediction_weight"],
    index=df_train_resampled1.index,
)

predictions_sWeight = pd.DataFrame(
    model3.predict(df_train_resampled1[train_aux_columns]),
    columns=["prediction_sWeight"],
    index=df_train_resampled1.index,
)

predictions_kinWeight = pd.DataFrame(
    model4.predict(df_train_resampled1[train_aux_columns]),
    columns=["prediction_kinWeight"],
    index=df_train_resampled1.index,
)

# Prepared training features plus the three predicted weights as extra columns.
df_train_resampled_all = pd.concat(
    [
        df_train_resampled1,
        predictions_weight,
        predictions_sWeight,
        predictions_kinWeight,
    ],
    axis=1,
)

Define and train a classifier on the concatenated dataframe.

In [None]:
# Final classifier: prepared features plus the three predicted weights,
# minus everything listed in excluded_features.
classifier_columns = df_train_resampled_all.columns.difference(excluded_features)

model2 = make_pipeline(lgb.LGBMClassifier(n_estimators=1000, random_state=21))
model2.fit(
    df_train_resampled_all[classifier_columns],
    df_train_resampled_all["label"],
)

Load private test data.

In [None]:
# Read the private test set and index it by the "id" column.
df_test_private = pd.read_csv("data/test_private_v2_track_1.csv").set_index("id")
df_test_private.head()

Prepare the private test dataframe for inference.

In [None]:
# Same feature engineering as applied to the training data.
df_test_private1 = prepare_df(df_test_private)

Make predictions from private test data.

In [None]:
# Columns fed to the auxiliary regressors.
private_aux_columns = df_test_private1.columns.difference(excluded_features)

# Every prediction frame reuses the index of df_test_private1. The private
# test frame is indexed by "id", and pd.concat(axis=1) aligns rows on the
# index, so a default 0..n-1 index is only correct if the ids happen to be
# exactly 0..n-1 in order.

# weight predictions
predictions_weight_private = pd.DataFrame(
    model1.predict(df_test_private1[private_aux_columns]),
    columns=["prediction_weight"],
    index=df_test_private1.index,
)

# sWeight predictions
predictions_sWeight_private = pd.DataFrame(
    model3.predict(df_test_private1[private_aux_columns]),
    columns=["prediction_sWeight"],
    index=df_test_private1.index,
)

# kinWeight predictions
# The column must be called "prediction_kinWeight", the name the classifier
# was trained with (it was previously misspelled "prediction_kinWight").
predictions_kinWeight_private = pd.DataFrame(
    model4.predict(df_test_private1[private_aux_columns]),
    columns=["prediction_kinWeight"],
    index=df_test_private1.index,
)

# Private dataset concatenated with weight, sWeight and kinWeight features
df_test_private_all = pd.concat(
    [
        df_test_private1,
        predictions_weight_private,
        predictions_sWeight_private,
        predictions_kinWeight_private,
    ],
    axis=1,
)

# Class probabilities from the final classifier, one row per test sample.
test_private_score = model2.predict_proba(
    df_test_private_all[df_test_private_all.columns.difference(excluded_features)]
)

Write submission csv file.

In [None]:
# Second column of predict_proba output becomes the "prediction" column,
# written out with the row index under the "id" header.
private_submission = pd.DataFrame(
    {"prediction": test_private_score[:, 1]},
    index=df_test_private_all.index,
)
private_submission.to_csv("submission_private_a.csv", header=True, index_label="id")