In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/turkiye-is-bankasi-ml-challenge-5/train_final.parquet
/kaggle/input/turkiye-is-bankasi-ml-challenge-5/test_final.parquet
/kaggle/input/turkiye-is-bankasi-ml-challenge-5/submission_sample_final.parquet


# Preprocessing, Imports & Helper Functions

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split



In [3]:
# Define the paths
path_train = r"/kaggle/input/turkiye-is-bankasi-ml-challenge-5/train_final.parquet"

In [4]:
# Load Train Parquet File
train_df = pd.read_parquet(path=path_train)

In [5]:
import tensorflow.keras.backend as K

def jaccard_score(y_true, y_pred):
    y_true = K.round(y_true)  # Convert probabilities to binary values
    y_pred = K.round(y_pred)
    intersection = K.sum(y_true * y_pred, axis=-1)
    union = K.sum(K.maximum(y_true, y_pred), axis=-1)
    jaccard = intersection / (union + K.epsilon())  # Add epsilon to avoid division by zero
    return K.mean(jaccard)

In [6]:
def categorize_carrier(x):
    if x == "VODAFONE TR":
        return "VODAFONE TR"
    elif x == "TURKCELL":
        return "TURKCELL"
    elif x == "TURK TELEKOM":
        return "TURK TELEKOM"
    else:
        return "others"
    
def categorize_device_brand(x):
    if x == "Apple":
        return "Apple"
    elif x == "samsung":
        return "samsung"
    elif x == "xiaomi":
        return "xiaomi"
    elif x == "HUAWEI":
        return "HUAWEI"
    elif x == "OPPO":
        return "OPPO"
    elif x == "Redmi":
        return "Redmi"
    else:
        return "others"
    
def convert_menu_to_binary(row):
    binary_vector = [0] * 9
    menus = row['target'].split(', ')
    for menu in menus:
        menu_number = int(menu.split('menu')[1])
        binary_vector[menu_number - 1] = 1
    return ''.join(map(str, binary_vector))

In [7]:
train_df = train_df.drop(['id', 'month'], axis=1)
train_df["carrier"] = train_df["carrier"].apply(categorize_carrier)
train_df["devicebrand"] = train_df["devicebrand"].apply(categorize_device_brand)
train_df['target'] = train_df.apply(convert_menu_to_binary, axis=1) 

train_df.head()

Unnamed: 0,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,feature_3,feature_4,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
0,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,-0.26358,2.161242,...,-2.613336,-2.032903,2.645719,-1.023478,1.658986,-1.559406,-2.161336,30,58,10110000
1,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,0.949101,3.567557,...,-0.983938,-1.453756,-0.021547,-0.19577,2.775513,-0.31898,-4.291473,21,45,100110
2,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,1.062306,4.197788,...,-1.668703,-3.599403,1.673868,0.63179,1.293131,-2.230909,-2.383524,19,61,10100010
3,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,1.643876,2.849205,...,-1.861418,-1.219658,1.863495,0.21317,1.02971,-1.142185,-4.466191,2,41,110001000
4,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,-1.487557,3.224788,...,-0.142903,-1.875545,1.024499,-0.186423,-0.061626,-1.462175,-2.371206,23,85,10001010


In [8]:
# Select the categorical feature columns
categorical_cols = ['carrier', 'devicebrand']
train_df_encoded = pd.get_dummies(train_df, columns=categorical_cols, dtype=int)

# Split the target variable into separate binary columns
target_columns = ['target_' + str(i) for i in range(9)]
train_df_encoded[target_columns] = train_df_encoded['target'].apply(lambda x: pd.Series([int(i) for i in list(x)]))

train_df_encoded.head()

Unnamed: 0,n_seconds_1,n_seconds_2,n_seconds_3,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,devicebrand_xiaomi,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8
0,5245.571,981.182,205.948,-1.197737,1.11336,-1.123334,-0.26358,2.161242,2.651375,0.810021,...,0,0,1,0,1,1,0,0,0,0
1,5184.876,557.65,487.587,-2.336352,2.567766,-0.494908,0.949101,3.567557,3.357848,0.434091,...,0,0,0,0,1,0,0,1,1,0
2,3835.618,3275.128,43.806,-2.561455,2.061736,-0.184511,1.062306,4.197788,1.551181,-0.596218,...,0,0,1,0,1,0,0,0,1,0
3,3532.544,154.509,64.724,-2.529918,3.35805,-0.851366,1.643876,2.849205,3.887427,1.854521,...,0,1,1,0,0,0,1,0,0,0
4,3344.192,787.896,715.115,-2.922361,2.096124,0.060796,-1.487557,3.224788,2.091947,-0.992961,...,0,0,1,0,0,0,1,0,1,0


In [9]:
X = train_df_encoded.drop(columns=['target']).drop(columns=['target_' + str(i) for i in range(9)])
y = train_df_encoded[['target_' + str(i) for i in range(9)]]

#X_train, y_train = train_test_split(X, y, test_size=0, random_state=42) # Split the data

# 1. Model

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(9, activation='sigmoid')
])

In [None]:
model.compile(optimizer='adam',                   
                  loss='binary_crossentropy',                   
                  metrics=[jaccard_score])     

In [None]:
history = model.fit(X_train, y_train, epochs=100)

# 2. Saint Model

In [10]:
content_columns = ['feature_' + str(i) for i in range(48)]
X_content = X[content_columns]
X_context = X.drop(columns=content_columns)

In [11]:
# Function to create an embedding layer
def create_embedding_layer(input_dim, output_dim, input_length):
    return tf.keras.layers.Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length)

# Function to create a multi-head self-attention layer
def create_multihead_self_attention(num_heads, key_dim, ff_dim, dropout=0.1):
    return tf.keras.layers.MultiHeadAttention(
        key_dim=key_dim, num_heads=num_heads, dropout=dropout
    )

# Function to create a feedforward layer
def create_feedforward_layer(ff_dim):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(ff_dim, activation='relu'),
        tf.keras.layers.Dense(ff_dim, activation='relu')
    ])

In [12]:
# Function to create a SAINT block
def create_saint_block(num_heads, key_dim, ff_dim, cnt_seq_len, cnx_seq_len, dropout=0.1):
    content_input = tf.keras.layers.Input(shape=(cnt_seq_len,))
    context_input = tf.keras.layers.Input(shape=(cnx_seq_len,))
    
    # Embedding Layers
    content_embedding = create_embedding_layer(content_input.shape[1], 32, cnt_seq_len)(content_input)
    context_embedding = create_embedding_layer(context_input.shape[1], 32, cnx_seq_len)(context_input)
    
    # Multi-Head Self-Attention Layers
    content_attention = create_multihead_self_attention(num_heads, key_dim, ff_dim, dropout)(content_embedding, content_embedding)
    context_attention = create_multihead_self_attention(num_heads, key_dim, ff_dim, dropout)(context_embedding, context_embedding)
    
    # Feed Forward Layers
    content_ffn = create_feedforward_layer(ff_dim)(content_attention)
    context_ffn = create_feedforward_layer(ff_dim)(context_attention)
    
    # Combine Content and Context Streams
    content_output = tf.keras.layers.GlobalAveragePooling1D()(content_ffn)
    context_output = tf.keras.layers.GlobalAveragePooling1D()(context_ffn)
    output = tf.keras.layers.Concatenate()([content_output, context_output])
    
    # Additional Dense Layers and Output Layer
    output = tf.keras.layers.Dense(64, activation="relu")(output)
    output = tf.keras.layers.Dense(9, activation="sigmoid")(output)
    
    return tf.keras.models.Model(inputs=[content_input, context_input], outputs=output)

In [13]:
# Define the parameters
num_heads = 8
key_dim = 32
ff_dim = 64
cnt_seq_len = 48
cnx_seq_len = 16

In [14]:
# Create the SAINT model
saint_model = create_saint_block(num_heads, key_dim, ff_dim, cnt_seq_len, cnx_seq_len)

# Print a summary of the model
saint_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 48)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 16)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 48, 32)       1536        ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 16, 32)       512         ['input_2[0][0]']                
                                                                                              

In [15]:
saint_model.compile(optimizer='adam',                   
                  loss='binary_crossentropy',                   
                  metrics=[jaccard_score]) 

In [17]:
saint_history = saint_model.fit([X_content, X_context], y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# 3. Evaluation & Prediction

In [None]:
validation_loss, validation_metric = model.evaluate(X_val, y_val)

In [18]:
path_test = r"/kaggle/input/turkiye-is-bankasi-ml-challenge-5/test_final.parquet"

# Load Test Parquet File
test_df = pd.read_parquet(path=path_test)

test_df.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,2e6105f5911256f4f6c4813ed,1,6893.544,246.854,242.636,VODAFONE TR,samsung,-1.723524,3.216489,-1.138474,...,-1.094519,-1.217407,-4.280456,1.51224,-2.306445,2.066388,0.844927,-1.026193,18,58
1,c56ad71dae0a5dbd3e7d36adc,1,4481.065,740.209,263.86,TURKCELL,Apple,-0.417275,2.024433,0.102952,...,1.806486,-3.477517,-2.064966,1.499805,1.284697,0.189269,-1.563224,-1.901654,3,35
2,4d02ea175f6581f0c6385311f,1,4340.702,2742.163,318.7,TURKCELL,samsung,-2.943294,2.769536,0.734942,...,1.75908,-2.038839,-2.067219,2.141083,0.055355,0.084739,-1.009925,-2.058473,7,50
3,3412d27a86c286ba078fa935c,1,4129.666,181.397,155.423,TURK TELEKOM,Apple,-2.346902,2.684752,0.168206,...,2.171847,-0.92504,-1.484278,0.666036,0.911519,0.616167,0.092304,-1.874706,22,47
4,0203b561f6f7e10eafa46eefa,1,3903.944,126.133,100.06,TURKCELL,POCO,-1.745354,2.355863,0.318961,...,-0.373413,-0.015773,-2.961445,1.301413,1.37509,-0.107355,0.92439,-1.606419,29,52


In [19]:
id_df = test_df["id"]
test_df = test_df.drop(['id', 'month'], axis=1)
test_df["carrier"] = test_df["carrier"].apply(categorize_carrier)
test_df["devicebrand"] = test_df["devicebrand"].apply(categorize_device_brand)

test_df.head()

Unnamed: 0,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,feature_3,feature_4,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,6893.544,246.854,242.636,VODAFONE TR,samsung,-1.723524,3.216489,-1.138474,2.026997,2.24167,...,-1.094519,-1.217407,-4.280456,1.51224,-2.306445,2.066388,0.844927,-1.026193,18,58
1,4481.065,740.209,263.86,TURKCELL,Apple,-0.417275,2.024433,0.102952,-1.634336,3.621519,...,1.806486,-3.477517,-2.064966,1.499805,1.284697,0.189269,-1.563224,-1.901654,3,35
2,4340.702,2742.163,318.7,TURKCELL,samsung,-2.943294,2.769536,0.734942,1.681471,3.229447,...,1.75908,-2.038839,-2.067219,2.141083,0.055355,0.084739,-1.009925,-2.058473,7,50
3,4129.666,181.397,155.423,TURK TELEKOM,Apple,-2.346902,2.684752,0.168206,-1.072321,4.97148,...,2.171847,-0.92504,-1.484278,0.666036,0.911519,0.616167,0.092304,-1.874706,22,47
4,3903.944,126.133,100.06,TURKCELL,others,-1.745354,2.355863,0.318961,-0.570734,4.056542,...,-0.373413,-0.015773,-2.961445,1.301413,1.37509,-0.107355,0.92439,-1.606419,29,52


In [20]:
# Select the categorical feature columns
categorical_cols = ['carrier', 'devicebrand']
test_df_encoded = pd.get_dummies(test_df, columns=categorical_cols, dtype=int)
test_df_encoded.head()

Unnamed: 0,n_seconds_1,n_seconds_2,n_seconds_3,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,carrier_TURKCELL,carrier_VODAFONE TR,carrier_others,devicebrand_Apple,devicebrand_HUAWEI,devicebrand_OPPO,devicebrand_Redmi,devicebrand_others,devicebrand_samsung,devicebrand_xiaomi
0,6893.544,246.854,242.636,-1.723524,3.216489,-1.138474,2.026997,2.24167,1.7961,-0.212805,...,0,1,0,0,0,0,0,0,1,0
1,4481.065,740.209,263.86,-0.417275,2.024433,0.102952,-1.634336,3.621519,1.506006,1.993639,...,1,0,0,1,0,0,0,0,0,0
2,4340.702,2742.163,318.7,-2.943294,2.769536,0.734942,1.681471,3.229447,2.711587,1.075506,...,1,0,0,0,0,0,0,0,1,0
3,4129.666,181.397,155.423,-2.346902,2.684752,0.168206,-1.072321,4.97148,1.38691,0.515737,...,0,0,0,1,0,0,0,0,0,0
4,3903.944,126.133,100.06,-1.745354,2.355863,0.318961,-0.570734,4.056542,2.005356,0.515711,...,1,0,0,0,0,0,0,1,0,0


In [22]:
# For Saint Model
content_columns = ['feature_' + str(i) for i in range(48)]
test_df_encoded_content = test_df_encoded[content_columns]
test_df_encoded_context = test_df_encoded.drop(columns=content_columns)
y_pred = saint_model.predict([test_df_encoded_content,test_df_encoded_context])



In [None]:
# Predict on the test set
y_pred = model.predict(test_df)

In [23]:
def top_n_binary(prediction, n=3):
    """
    Converts the top 'n' values in the prediction array to 1, and the rest to 0.
    
    Args:
        prediction (numpy.ndarray): 1D array representing the model's prediction.
        n (int): Number of top values to set as '1'.

    Returns:
        numpy.ndarray: Binary sequence with 'n' highest values set to 1, and the rest to 0.
    """
    sorted_indices = prediction.argsort()[::-1]  # Get indices of sorted values in descending order
    binary_sequence = np.zeros_like(prediction)
    binary_sequence[sorted_indices[:n]] = 1
    return binary_sequence

In [24]:
binary_sequence = np.apply_along_axis(top_n_binary, 1, y_pred)
binary_sequence

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [25]:
columns_for = ['target_' + str(i) for i in range(9)]

In [26]:
binary_predictions = pd.DataFrame(binary_sequence, columns=columns_for)
binary_predictions = binary_predictions.astype(int).astype(str).apply(''.join, axis=1)
binary_predictions.head()

0    010011000
1    010101000
2    010101000
3    010101000
4    010101000
dtype: object

In [27]:
result_df = pd.concat([id_df, binary_predictions], axis=1)

In [28]:
result_df.rename(columns={0: 'target'}, inplace=True)

In [29]:
result_df.head()

Unnamed: 0,id,target
0,2e6105f5911256f4f6c4813ed,10011000
1,c56ad71dae0a5dbd3e7d36adc,10101000
2,4d02ea175f6581f0c6385311f,10101000
3,3412d27a86c286ba078fa935c,10101000
4,0203b561f6f7e10eafa46eefa,10101000


In [30]:
result_df.to_parquet('saint_predictions.parquet', index=False)

In [31]:
def count_non_three_ones(df, column_name='target'):
    count = 0
    for target in df[column_name]:
        if target.count('1') < 3:
            count += 1
    return count

con = count_non_three_ones(result_df)
con

0