In [None]:
!pip install pytorch-tabnet

In [None]:
import pandas as pd
import numpy as np

import warnings 
warnings.simplefilter('ignore')

from pytorch_tabnet.tab_model import TabNetClassifier           
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import  KFold
import torch


import time
import gc
import os
from glob import glob
import optuna

from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Read the three input tables: the competition train/test CSVs and the
# additional "original dataset" CSV.
input_paths = (
    '/kaggle/input/playground-series-s5e6/train.csv',
    '/kaggle/input/playground-series-s5e6/test.csv',
    '/kaggle/input/original-dataset-s5e6/Fertilizer Prediction.csv',
)
train, test, orig = (pd.read_csv(csv_path) for csv_path in input_paths)

In [None]:
def encode(df):
    """Integer-encode the categorical columns of a fertilizer DataFrame.

    'Soil Type' and 'Crop Type' are always encoded.  'Fertilizer Name' is
    encoded only when that column is present, so frames without the target
    can be passed as well.  Values that are not keys of a mapping (including
    already-encoded integers) are passed through unchanged, which keeps the
    function safe to apply twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Soil Type' and 'Crop Type' columns and, optionally,
        a 'Fertilizer Name' column.

    Returns
    -------
    pandas.DataFrame
        An encoded copy of ``df``; the caller's frame is left untouched.

    Raises
    ------
    KeyError
        If 'Soil Type' or 'Crop Type' is missing from ``df``.
    """
    soil_type_dict = {'Sandy' : 0, 'Black' : 1, 'Clayey' : 2, 'Red' : 3, 'Loamy' : 4}
    crop_type_dict = {'Paddy' : 0, 'Pulses' : 1, 'Cotton' : 2, 'Tobacco' : 3, 'Wheat' : 4, 'Millets' : 5, 'Barley' : 6, 'Sugarcane' : 7,
                     'Oil seeds' : 8, 'Maize' : 9, 'Ground Nuts' : 10}
    fertilizer_name_dict = {'10-26-26': 0, '14-35-14': 1, '17-17-17': 2, '20-20': 3,
        '28-28': 4, 'DAP': 5, 'Urea': 6}

    column_mappings = {
        'Soil Type': soil_type_dict,
        'Crop Type': crop_type_dict,
    }
    if 'Fertilizer Name' in df.columns:
        column_mappings['Fertilizer Name'] = fertilizer_name_dict

    # Work on a copy so the caller's DataFrame is never mutated.
    df = df.copy()

    for column, mapping in column_mappings.items():
        # A pass-through map re-infers the dtype of the result, so fully
        # mapped columns become integer columns without relying on the
        # deprecated implicit downcasting of Series.replace.
        df[column] = df[column].map(
            lambda value, mapping=mapping: mapping.get(value, value)
        )

    return df


# Run the same categorical encoding over all three frames.
train, test, orig = (encode(frame) for frame in (train, test, orig))

In [None]:
# Remove the row identifier column from both frames.  Rebinding the result
# (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it after 'id' is already gone does not raise.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [None]:
# Fertilizer label -> integer code; the code is the position of the label
# in the list below.
fertilizer_name_dict = {
    label: code
    for code, label in enumerate(
        ['10-26-26', '14-35-14', '17-17-17', '20-20', '28-28', 'DAP', 'Urea']
    )
}
# Reverse lookup: integer code -> fertilizer label.
inverse_fert_dict = dict(
    zip(fertilizer_name_dict.values(), fertilizer_name_dict.keys())
)

In [None]:

def apk(actual, predicted, k=3):
    """Average precision at ``k`` for a single ranked prediction list.

    Only the first ``k`` entries of ``predicted`` are considered.  An entry
    counts as a hit when it is contained in ``actual`` and has not already
    appeared earlier in the ranking; each hit contributes the running
    precision (hits so far / rank).  The sum is normalised by
    ``min(len(actual), k)``.

    Parameters
    ----------
    actual : list
        Relevant items for this query.
    predicted : list
        Ranked predictions, best first.
    k : int, default 3
        Cut-off rank.

    Returns
    -------
    float
        The average precision at ``k``; ``0.0`` when ``actual`` is empty.
    """
    ranking = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(ranking, start=1):
        # Skip misses and repeats of an item already seen higher up.
        if item not in actual or item in ranking[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0
    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """Mean average precision at ``k`` over paired queries.

    ``actual`` and ``predicted`` are iterated in lockstep; ``apk`` is
    evaluated for every pair and the scores are averaged with ``np.mean``.
    """
    per_query_scores = []
    for truth, ranking in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranking, k))
    return np.mean(per_query_scores)


In [None]:
# Make sure the output folders exist before anything is written to them.
for output_dir in ("oof_predictions", "submission_predictions"):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Cross-validation configuration.
TARGET_COL  = 'Fertilizer Name'
FEATURES    = [c for c in train.columns if c not in ['id', TARGET_COL]]
FOLDS       = 5

n_classes = train[TARGET_COL].nunique()

# Out-of-fold class probabilities for every training row, and the
# fold-averaged class probabilities for the test rows.
oof_preds = np.zeros((len(train), n_classes))
test_preds = np.zeros((len(test), n_classes))

# n_splits is tied to FOLDS so that the "/ FOLDS" averaging of the test
# predictions below always matches the real number of folds.
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
fold_scores = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(train)):

    # KFold.split yields positional indices, so rows are selected with
    # .iloc; this stays correct even if `train` has a non-default index.
    train_fold = train.iloc[tr_idx]
    valid_fold = train.iloc[val_idx]

    X_tr, y_tr = train_fold[FEATURES], train_fold[TARGET_COL]
    X_val, y_val = valid_fold[FEATURES], valid_fold[TARGET_COL]

    # The original dataset is appended to the training part only; the
    # validation part contains competition rows exclusively.
    X_tr = pd.concat([X_tr, orig[FEATURES]], axis=0, ignore_index=True)
    y_tr = pd.concat([y_tr, orig[TARGET_COL]], axis=0, ignore_index=True)

    # Standardise with statistics of this fold's training data.  A zero
    # standard deviation is replaced by 1 to avoid division by zero.
    mean_ = X_tr.mean()
    std_  = X_tr.std().replace(0, 1)

    X_tr  = ((X_tr - mean_) / std_).values
    X_val = ((X_val - mean_) / std_).values
    # Select FEATURES explicitly so the test matrix has exactly the same
    # columns, in the same order, as the training matrix.
    X_test_scaled = ((test[FEATURES] - mean_) / std_).values

    model = TabNetClassifier(
        n_d=64,
        n_a=64,
        n_steps=5,
        gamma=1.5,
        n_independent=2,
        n_shared=2,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        scheduler_params={"step_size":50, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type="sparsemax",
        verbose=10,
        device_name="auto"
    )

    # 3.2) Train
    model.fit(
        X_tr, y_tr.values,
        eval_set=[(X_val, y_val.values)],
        eval_name=["val"],
        eval_metric=["logloss","accuracy"],
        max_epochs=200,
        patience=30,
        batch_size=1024,
        virtual_batch_size=256,
        num_workers=4
    )

    # Store this fold's out-of-fold probabilities and add its share of the
    # test-set average.
    proba_val = model.predict_proba(X_val)
    oof_preds[val_idx] = proba_val

    test_preds += model.predict_proba(X_test_scaled) / FOLDS

    # Three most probable classes per validation row, most probable first.
    top3 = np.argsort(proba_val, axis=1)[:, -3:][:, ::-1]
    actual    = [[lab] for lab in y_val]
    predicted = top3.tolist()

    score = mapk(actual, predicted, k=3)
    fold_scores.append(score)
    print(f"Fold {fold} MAP@3: {score:.5f}")

    # Release the per-fold arrays before the next fold starts.
    del X_tr, X_val, y_tr, y_val, proba_val, top3
    gc.collect()

avg_map3 = np.mean(fold_scores)
print(f"\n>>> Average MAP@3 over {len(fold_scores)} folds: {avg_map3:.5f}")

In [None]:
# Build the submission file: for every test row, the three most probable
# fertilizer names, most probable first, joined by single spaces.
sample_sub = pd.read_csv('/kaggle/input/playground-series-s5e6/sample_submission.csv')

class_order = np.argsort(test_preds, axis=1)
top3_preds = class_order[:, -3:][:, ::-1]

top3_str = []
for row in top3_preds:
    row_names = [inverse_fert_dict[i] for i in row]
    top3_str.append(" ".join(row_names))

submission_df = pd.DataFrame({"id": sample_sub["id"], "Fertilizer Name": top3_str})
submission_path = f"submission_predictions/TabNET_cv{avg_map3}_submission.csv"
submission_df.to_csv(submission_path, index=False)

# Persist the out-of-fold probabilities next to the submission.
oof_path = f"oof_predictions/TabNET.cv_{avg_map3}_oof_trial.npy"
np.save(oof_path, oof_preds)