In [1]:
import xgboost as xgb
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 100)

In [2]:
# Define the paths
path_train = "./turkiye-is-bankasi-ml-challenge-5/train_final.parquet"

In [3]:
# Load Train Parquet File
train_df = pd.read_parquet(path=path_train)

In [4]:
def categorize_carrier(x):
    if x == "VODAFONE TR":
        return "VODAFONE TR"
    elif x == "TURKCELL":
        return "TURKCELL"
    elif x == "TURK TELEKOM":
        return "TURK TELEKOM"
    else:
        return "others"
    
def categorize_device_brand(x):
    if x == "Apple":
        return "Apple"
    elif x == "samsung":
        return "samsung"
    elif x == "xiaomi":
        return "xiaomi"
    elif x == "HUAWEI":
        return "HUAWEI"
    elif x == "OPPO":
        return "OPPO"
    elif x == "Redmi":
        return "Redmi"
    else:
        return "others"
    
def convert_menu_to_binary(row):
    binary_vector = [0] * 9
    menus = row['target'].split(', ')
    for menu in menus:
        menu_number = int(menu.split('menu')[1])
        binary_vector[menu_number - 1] = 1
    return ''.join(map(str, binary_vector))


def top_n_binary(prediction, n=3):
    """
    Converts the top 'n' values in the prediction array to 1, and the rest to 0.
    
    Args:
        prediction (numpy.ndarray): 1D array representing the model's prediction.
        n (int): Number of top values to set as '1'.

    Returns:
        numpy.ndarray: Binary sequence with 'n' highest values set to 1, and the rest to 0.
    """
    sorted_indices = prediction.argsort()[::-1]  # Get indices of sorted values in descending order
    binary_sequence = np.zeros_like(prediction)
    binary_sequence[sorted_indices[:n]] = 1
    return binary_sequence

In [5]:
train_df = train_df.drop(['id', 'month'], axis=1)
train_df["carrier"] = train_df["carrier"].apply(categorize_carrier)
train_df["devicebrand"] = train_df["devicebrand"].apply(categorize_device_brand)
train_df['target'] = train_df.apply(convert_menu_to_binary, axis=1) 

train_df.head()

Unnamed: 0,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
0,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,-0.26358,2.161242,2.651375,0.810021,1.516175,2.351266,-4.607631,0.678258,-0.555387,3.501726,-0.764794,1.543375,-1.457849,-1.017322,-1.81952,0.725629,-1.44358,1.087103,2.101993,-0.248796,-0.58292,-1.330125,1.407168,1.053509,-1.531511,1.667421,0.250819,-0.090907,2.67646,-0.620866,0.163055,1.720066,-6.13939,-0.302724,2.784268,-3.135499,-3.79442,0.93044,-2.613336,-2.032903,2.645719,-1.023478,1.658986,-1.559406,-2.161336,30,58,10110000
1,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,0.949101,3.567557,3.357848,0.434091,0.885814,1.451937,-3.183905,1.023886,1.165963,2.91573,-1.280556,3.018781,-3.163132,-1.120173,-2.798378,0.287021,-2.029428,0.259852,0.961554,-0.533738,-1.311127,-0.143812,-1.561752,1.930543,-0.942348,-0.000383,-1.372437,1.22536,2.795093,-0.960509,0.113925,1.048365,-5.696538,0.901973,-0.073045,-2.366235,-0.599732,0.815496,-0.983938,-1.453756,-0.021547,-0.19577,2.775513,-0.31898,-4.291473,21,45,100110
2,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,1.062306,4.197788,1.551181,-0.596218,-0.618501,2.017303,-4.631071,-0.66349,-0.837522,1.821225,-2.171022,3.695091,-0.921562,-2.873224,-2.042132,0.532546,-0.127737,0.450119,2.716429,-0.178209,0.052197,-0.517862,-0.042886,1.15698,-0.527013,-0.088458,-0.497279,1.202904,2.742306,-0.622616,1.756662,2.058538,-5.359065,0.928563,1.1331,-3.099511,-1.253229,1.056681,-1.668703,-3.599403,1.673868,0.63179,1.293131,-2.230909,-2.383524,19,61,10100010
3,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,1.643876,2.849205,3.887427,1.854521,0.988186,2.065699,-5.803766,1.93142,0.151038,3.144926,-2.67308,0.377882,-2.65695,-0.378486,-3.275756,0.00277,-0.674196,-0.227111,1.817213,0.06805,-0.786776,-1.707725,-1.999539,1.769581,0.04931,0.229993,-0.303635,1.659189,1.692415,-1.104764,1.242264,2.043422,-3.85403,0.648766,0.944281,-3.02672,-1.823863,0.203653,-1.861418,-1.219658,1.863495,0.21317,1.02971,-1.142185,-4.466191,2,41,110001000
4,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,-1.487557,3.224788,2.091947,-0.992961,0.686043,2.08975,-2.974129,-0.797499,-1.459318,2.39795,-1.308208,3.381416,-2.663701,-2.219488,-1.713531,1.287623,-2.250461,2.345008,0.402893,-0.076522,0.289082,1.847069,1.423561,1.888181,-1.02226,1.021054,-0.843386,-0.859217,2.565366,0.412179,0.067829,0.391379,-5.995943,2.675444,0.203494,-3.272419,-1.417781,2.188403,-0.142903,-1.875545,1.024499,-0.186423,-0.061626,-1.462175,-2.371206,23,85,10001010


In [6]:
# Select the categorical feature columns
categorical_cols = ['carrier', 'devicebrand']
train_df_encoded = pd.get_dummies(train_df, columns=categorical_cols, dtype=int)

# Split the target variable into separate binary columns
target_columns = ['target_' + str(i) for i in range(9)]
train_df_encoded[target_columns] = train_df_encoded['target'].apply(lambda x: pd.Series([int(i) for i in list(x)]))

train_df_encoded.head()

Unnamed: 0,n_seconds_1,n_seconds_2,n_seconds_3,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target,carrier_TURK TELEKOM,carrier_TURKCELL,carrier_VODAFONE TR,carrier_others,devicebrand_Apple,devicebrand_HUAWEI,devicebrand_OPPO,devicebrand_Redmi,devicebrand_others,devicebrand_samsung,devicebrand_xiaomi,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8
0,5245.571,981.182,205.948,-1.197737,1.11336,-1.123334,-0.26358,2.161242,2.651375,0.810021,1.516175,2.351266,-4.607631,0.678258,-0.555387,3.501726,-0.764794,1.543375,-1.457849,-1.017322,-1.81952,0.725629,-1.44358,1.087103,2.101993,-0.248796,-0.58292,-1.330125,1.407168,1.053509,-1.531511,1.667421,0.250819,-0.090907,2.67646,-0.620866,0.163055,1.720066,-6.13939,-0.302724,2.784268,-3.135499,-3.79442,0.93044,-2.613336,-2.032903,2.645719,-1.023478,1.658986,-1.559406,-2.161336,30,58,10110000,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0
1,5184.876,557.65,487.587,-2.336352,2.567766,-0.494908,0.949101,3.567557,3.357848,0.434091,0.885814,1.451937,-3.183905,1.023886,1.165963,2.91573,-1.280556,3.018781,-3.163132,-1.120173,-2.798378,0.287021,-2.029428,0.259852,0.961554,-0.533738,-1.311127,-0.143812,-1.561752,1.930543,-0.942348,-0.000383,-1.372437,1.22536,2.795093,-0.960509,0.113925,1.048365,-5.696538,0.901973,-0.073045,-2.366235,-0.599732,0.815496,-0.983938,-1.453756,-0.021547,-0.19577,2.775513,-0.31898,-4.291473,21,45,100110,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0
2,3835.618,3275.128,43.806,-2.561455,2.061736,-0.184511,1.062306,4.197788,1.551181,-0.596218,-0.618501,2.017303,-4.631071,-0.66349,-0.837522,1.821225,-2.171022,3.695091,-0.921562,-2.873224,-2.042132,0.532546,-0.127737,0.450119,2.716429,-0.178209,0.052197,-0.517862,-0.042886,1.15698,-0.527013,-0.088458,-0.497279,1.202904,2.742306,-0.622616,1.756662,2.058538,-5.359065,0.928563,1.1331,-3.099511,-1.253229,1.056681,-1.668703,-3.599403,1.673868,0.63179,1.293131,-2.230909,-2.383524,19,61,10100010,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0
3,3532.544,154.509,64.724,-2.529918,3.35805,-0.851366,1.643876,2.849205,3.887427,1.854521,0.988186,2.065699,-5.803766,1.93142,0.151038,3.144926,-2.67308,0.377882,-2.65695,-0.378486,-3.275756,0.00277,-0.674196,-0.227111,1.817213,0.06805,-0.786776,-1.707725,-1.999539,1.769581,0.04931,0.229993,-0.303635,1.659189,1.692415,-1.104764,1.242264,2.043422,-3.85403,0.648766,0.944281,-3.02672,-1.823863,0.203653,-1.861418,-1.219658,1.863495,0.21317,1.02971,-1.142185,-4.466191,2,41,110001000,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0
4,3344.192,787.896,715.115,-2.922361,2.096124,0.060796,-1.487557,3.224788,2.091947,-0.992961,0.686043,2.08975,-2.974129,-0.797499,-1.459318,2.39795,-1.308208,3.381416,-2.663701,-2.219488,-1.713531,1.287623,-2.250461,2.345008,0.402893,-0.076522,0.289082,1.847069,1.423561,1.888181,-1.02226,1.021054,-0.843386,-0.859217,2.565366,0.412179,0.067829,0.391379,-5.995943,2.675444,0.203494,-3.272419,-1.417781,2.188403,-0.142903,-1.875545,1.024499,-0.186423,-0.061626,-1.462175,-2.371206,23,85,10001010,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0


In [7]:
X = train_df_encoded.drop(columns=['target']).drop(columns=['target_' + str(i) for i in range(9)])
y = train_df_encoded[['target_' + str(i) for i in range(9)]]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42) # Split the data

In [8]:
# Initialize the XGBoost model
model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

In [9]:
# Train the model
model.fit(X_train, y_train)

In [10]:
# Predict on the val set
y_pred = model.predict(X_val)

In [11]:
# Calculate the Jaccard Score
jaccard = jaccard_score(y_val.values.flatten(), y_pred.flatten(), average='binary')
print(f"Jaccard Score: {jaccard}")

Jaccard Score: 0.4870769187430455


# 3. Test

In [12]:
# Define the paths
path_test = "./turkiye-is-bankasi-ml-challenge-5/test_final.parquet"

# Load Test Parquet File
test_df = pd.read_parquet(path=path_test)

test_df.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,2e6105f5911256f4f6c4813ed,1,6893.544,246.854,242.636,VODAFONE TR,samsung,-1.723524,3.216489,-1.138474,2.026997,2.24167,1.7961,-0.212805,0.447929,3.46516,-4.219648,-0.931751,3.633603,-0.555067,-2.298111,0.511194,-0.383306,-2.593233,-2.447223,0.924127,-0.411446,0.246188,2.257871,3.599789,-1.497506,0.516612,0.663096,1.871465,0.861411,-1.024769,-2.006591,0.209522,-0.097602,-2.985843,-0.042177,3.813231,-4.685382,-0.249806,2.27678,-0.367957,-1.724633,-1.094519,-1.217407,-4.280456,1.51224,-2.306445,2.066388,0.844927,-1.026193,18,58
1,c56ad71dae0a5dbd3e7d36adc,1,4481.065,740.209,263.86,TURKCELL,Apple,-0.417275,2.024433,0.102952,-1.634336,3.621519,1.506006,1.993639,0.434495,0.705718,-3.248426,-0.74533,-0.761663,3.166748,1.194949,3.198201,-0.674974,-0.555677,-1.829533,-1.155211,-1.77103,2.684586,0.857986,-0.147427,0.130127,0.207778,0.97187,0.35615,-3.598074,1.380936,-1.654721,-0.317826,2.26355,-0.277017,0.78862,-1.020164,-6.363883,0.656803,0.664108,-2.564899,-1.020139,1.806486,-3.477517,-2.064966,1.499805,1.284697,0.189269,-1.563224,-1.901654,3,35
2,4d02ea175f6581f0c6385311f,1,4340.702,2742.163,318.7,TURKCELL,samsung,-2.943294,2.769536,0.734942,1.681471,3.229447,2.711587,1.075506,0.104691,1.275551,-4.784873,-0.621247,0.928116,2.831212,-0.41981,3.24425,-1.674474,-2.556517,-2.589562,-0.821467,-0.831514,0.728772,2.415584,-1.964435,0.120592,-0.610942,-1.603177,0.148732,-1.516807,1.761628,-1.741026,0.819192,2.675048,-0.72442,0.7162,0.686554,-5.850377,0.464842,1.287392,-0.684942,-2.195384,1.75908,-2.038839,-2.067219,2.141083,0.055355,0.084739,-1.009925,-2.058473,7,50
3,3412d27a86c286ba078fa935c,1,4129.666,181.397,155.423,TURK TELEKOM,Apple,-2.346902,2.684752,0.168206,-1.072321,4.97148,1.38691,0.515737,0.62161,-0.354368,-2.95828,-1.215555,-1.326045,2.019457,-0.829898,4.686781,-2.994842,-2.031204,-1.832882,-0.959937,-2.212131,1.964668,0.470133,-0.860664,-0.312078,1.205403,-0.672658,1.095291,-0.621464,0.803356,-2.377215,-0.371055,3.786268,-0.030495,0.883739,-0.861583,-5.115179,1.867099,-0.37276,-1.930334,-1.264293,2.171847,-0.92504,-1.484278,0.666036,0.911519,0.616167,0.092304,-1.874706,22,47
4,0203b561f6f7e10eafa46eefa,1,3903.944,126.133,100.06,TURKCELL,POCO,-1.745354,2.355863,0.318961,-0.570734,4.056542,2.005356,0.515711,1.297831,0.46884,-4.917929,-1.603623,-1.475862,2.00984,-0.815374,3.730217,-1.009404,-2.430358,-1.781647,-0.569731,-2.180424,1.448957,1.125873,0.684027,0.001094,0.959921,0.682983,0.880568,-1.494863,-0.546022,-0.780245,0.16566,2.890538,-0.308851,1.651937,0.8456,-6.147421,1.067847,1.000501,-3.929198,-0.657711,-0.373413,-0.015773,-2.961445,1.301413,1.37509,-0.107355,0.92439,-1.606419,29,52


In [13]:
id_df = test_df["id"]
test_df = test_df.drop(['id', 'month'], axis=1)
test_df["carrier"] = test_df["carrier"].apply(categorize_carrier)
test_df["devicebrand"] = test_df["devicebrand"].apply(categorize_device_brand)

test_df.head()

Unnamed: 0,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,6893.544,246.854,242.636,VODAFONE TR,samsung,-1.723524,3.216489,-1.138474,2.026997,2.24167,1.7961,-0.212805,0.447929,3.46516,-4.219648,-0.931751,3.633603,-0.555067,-2.298111,0.511194,-0.383306,-2.593233,-2.447223,0.924127,-0.411446,0.246188,2.257871,3.599789,-1.497506,0.516612,0.663096,1.871465,0.861411,-1.024769,-2.006591,0.209522,-0.097602,-2.985843,-0.042177,3.813231,-4.685382,-0.249806,2.27678,-0.367957,-1.724633,-1.094519,-1.217407,-4.280456,1.51224,-2.306445,2.066388,0.844927,-1.026193,18,58
1,4481.065,740.209,263.86,TURKCELL,Apple,-0.417275,2.024433,0.102952,-1.634336,3.621519,1.506006,1.993639,0.434495,0.705718,-3.248426,-0.74533,-0.761663,3.166748,1.194949,3.198201,-0.674974,-0.555677,-1.829533,-1.155211,-1.77103,2.684586,0.857986,-0.147427,0.130127,0.207778,0.97187,0.35615,-3.598074,1.380936,-1.654721,-0.317826,2.26355,-0.277017,0.78862,-1.020164,-6.363883,0.656803,0.664108,-2.564899,-1.020139,1.806486,-3.477517,-2.064966,1.499805,1.284697,0.189269,-1.563224,-1.901654,3,35
2,4340.702,2742.163,318.7,TURKCELL,samsung,-2.943294,2.769536,0.734942,1.681471,3.229447,2.711587,1.075506,0.104691,1.275551,-4.784873,-0.621247,0.928116,2.831212,-0.41981,3.24425,-1.674474,-2.556517,-2.589562,-0.821467,-0.831514,0.728772,2.415584,-1.964435,0.120592,-0.610942,-1.603177,0.148732,-1.516807,1.761628,-1.741026,0.819192,2.675048,-0.72442,0.7162,0.686554,-5.850377,0.464842,1.287392,-0.684942,-2.195384,1.75908,-2.038839,-2.067219,2.141083,0.055355,0.084739,-1.009925,-2.058473,7,50
3,4129.666,181.397,155.423,TURK TELEKOM,Apple,-2.346902,2.684752,0.168206,-1.072321,4.97148,1.38691,0.515737,0.62161,-0.354368,-2.95828,-1.215555,-1.326045,2.019457,-0.829898,4.686781,-2.994842,-2.031204,-1.832882,-0.959937,-2.212131,1.964668,0.470133,-0.860664,-0.312078,1.205403,-0.672658,1.095291,-0.621464,0.803356,-2.377215,-0.371055,3.786268,-0.030495,0.883739,-0.861583,-5.115179,1.867099,-0.37276,-1.930334,-1.264293,2.171847,-0.92504,-1.484278,0.666036,0.911519,0.616167,0.092304,-1.874706,22,47
4,3903.944,126.133,100.06,TURKCELL,others,-1.745354,2.355863,0.318961,-0.570734,4.056542,2.005356,0.515711,1.297831,0.46884,-4.917929,-1.603623,-1.475862,2.00984,-0.815374,3.730217,-1.009404,-2.430358,-1.781647,-0.569731,-2.180424,1.448957,1.125873,0.684027,0.001094,0.959921,0.682983,0.880568,-1.494863,-0.546022,-0.780245,0.16566,2.890538,-0.308851,1.651937,0.8456,-6.147421,1.067847,1.000501,-3.929198,-0.657711,-0.373413,-0.015773,-2.961445,1.301413,1.37509,-0.107355,0.92439,-1.606419,29,52


In [14]:
# Select the categorical feature columns
categorical_cols = ['carrier', 'devicebrand']
test_df_encoded = pd.get_dummies(test_df, columns=categorical_cols, dtype=int)
test_df_encoded.head()

Unnamed: 0,n_seconds_1,n_seconds_2,n_seconds_3,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,carrier_TURK TELEKOM,carrier_TURKCELL,carrier_VODAFONE TR,carrier_others,devicebrand_Apple,devicebrand_HUAWEI,devicebrand_OPPO,devicebrand_Redmi,devicebrand_others,devicebrand_samsung,devicebrand_xiaomi
0,6893.544,246.854,242.636,-1.723524,3.216489,-1.138474,2.026997,2.24167,1.7961,-0.212805,0.447929,3.46516,-4.219648,-0.931751,3.633603,-0.555067,-2.298111,0.511194,-0.383306,-2.593233,-2.447223,0.924127,-0.411446,0.246188,2.257871,3.599789,-1.497506,0.516612,0.663096,1.871465,0.861411,-1.024769,-2.006591,0.209522,-0.097602,-2.985843,-0.042177,3.813231,-4.685382,-0.249806,2.27678,-0.367957,-1.724633,-1.094519,-1.217407,-4.280456,1.51224,-2.306445,2.066388,0.844927,-1.026193,18,58,0,0,1,0,0,0,0,0,0,1,0
1,4481.065,740.209,263.86,-0.417275,2.024433,0.102952,-1.634336,3.621519,1.506006,1.993639,0.434495,0.705718,-3.248426,-0.74533,-0.761663,3.166748,1.194949,3.198201,-0.674974,-0.555677,-1.829533,-1.155211,-1.77103,2.684586,0.857986,-0.147427,0.130127,0.207778,0.97187,0.35615,-3.598074,1.380936,-1.654721,-0.317826,2.26355,-0.277017,0.78862,-1.020164,-6.363883,0.656803,0.664108,-2.564899,-1.020139,1.806486,-3.477517,-2.064966,1.499805,1.284697,0.189269,-1.563224,-1.901654,3,35,0,1,0,0,1,0,0,0,0,0,0
2,4340.702,2742.163,318.7,-2.943294,2.769536,0.734942,1.681471,3.229447,2.711587,1.075506,0.104691,1.275551,-4.784873,-0.621247,0.928116,2.831212,-0.41981,3.24425,-1.674474,-2.556517,-2.589562,-0.821467,-0.831514,0.728772,2.415584,-1.964435,0.120592,-0.610942,-1.603177,0.148732,-1.516807,1.761628,-1.741026,0.819192,2.675048,-0.72442,0.7162,0.686554,-5.850377,0.464842,1.287392,-0.684942,-2.195384,1.75908,-2.038839,-2.067219,2.141083,0.055355,0.084739,-1.009925,-2.058473,7,50,0,1,0,0,0,0,0,0,0,1,0
3,4129.666,181.397,155.423,-2.346902,2.684752,0.168206,-1.072321,4.97148,1.38691,0.515737,0.62161,-0.354368,-2.95828,-1.215555,-1.326045,2.019457,-0.829898,4.686781,-2.994842,-2.031204,-1.832882,-0.959937,-2.212131,1.964668,0.470133,-0.860664,-0.312078,1.205403,-0.672658,1.095291,-0.621464,0.803356,-2.377215,-0.371055,3.786268,-0.030495,0.883739,-0.861583,-5.115179,1.867099,-0.37276,-1.930334,-1.264293,2.171847,-0.92504,-1.484278,0.666036,0.911519,0.616167,0.092304,-1.874706,22,47,1,0,0,0,1,0,0,0,0,0,0
4,3903.944,126.133,100.06,-1.745354,2.355863,0.318961,-0.570734,4.056542,2.005356,0.515711,1.297831,0.46884,-4.917929,-1.603623,-1.475862,2.00984,-0.815374,3.730217,-1.009404,-2.430358,-1.781647,-0.569731,-2.180424,1.448957,1.125873,0.684027,0.001094,0.959921,0.682983,0.880568,-1.494863,-0.546022,-0.780245,0.16566,2.890538,-0.308851,1.651937,0.8456,-6.147421,1.067847,1.000501,-3.929198,-0.657711,-0.373413,-0.015773,-2.961445,1.301413,1.37509,-0.107355,0.92439,-1.606419,29,52,0,1,0,0,0,0,0,0,1,0,0


In [15]:
# Predict on the test set
y_pred = model.predict(test_df_encoded)

In [16]:
binary_sequence = np.apply_along_axis(top_n_binary, 1, y_pred)
binary_sequence

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 1., 0., ..., 0., 1., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [17]:
binary_predictions = pd.DataFrame(binary_sequence, columns=target_columns)
binary_predictions = binary_predictions.astype(int).astype(str).apply(''.join, axis=1)

In [18]:
result_df = pd.DataFrame({'id': id_df, 'target': binary_predictions})

In [19]:
result_df.head()

Unnamed: 0,id,target
0,2e6105f5911256f4f6c4813ed,10100010
1,c56ad71dae0a5dbd3e7d36adc,1011
2,4d02ea175f6581f0c6385311f,10000011
3,3412d27a86c286ba078fa935c,10000011
4,0203b561f6f7e10eafa46eefa,10001010


In [20]:
# Save to parquet file
result_df.to_parquet('xgb_predictions.parquet', index=False)

In [21]:
def count_non_three_ones(df, column_name='target'):
    count = 0
    for target in df[column_name]:
        if target.count('1') != 3:
            count += 1
    return count

con = count_non_three_ones(result_df)
con

0