In [None]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
# Decision Tree Classifier Model
from sklearn.tree import DecisionTreeClassifier
# Random Forest Model 
from sklearn.ensemble import RandomForestClassifier
# Extreme gradient boosting model
from xgboost import XGBClassifier
# Categorical Boosting Model 
from catboost import CatBoostClassifier, Pool
# KFold (Multiple model runs)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# train data

In [None]:
# Load the labelled training data.
df_train = pd.read_csv('data/train.csv')

# Replace the class labels in 'Target' with integer codes. The fitted encoder
# is kept so that predicted codes can be mapped back to the original labels.
label_encoder = LabelEncoder()
df_train['Target'] = label_encoder.fit_transform(df_train['Target'])

# Target series and predictor frame (every column except 'Target').
y = df_train['Target']
X = df_train.drop(columns='Target')

# lightgbm

In [None]:
def test_lgbm_model(X, y, lgbm_params=None, cv=5):
    '''
    Cross-validate an LGBM classifier and collect per-class accuracy.

    The data is split with a shuffled StratifiedKFold (random_state=123).
    For every fold a fresh classifier is fitted and, for each distinct value
    of `y`, the accuracy on the held-out rows of that class is computed.

    Args:
        X: Dataframe with predictor features.
        y: Series with the target feature.
        lgbm_params: Keyword arguments for LGBMClassifier. When None, the
            classifier is built with `verbose=-1` only.
        cv: Number of stratified folds.

    Returns:
        average_score: mean over folds of the per-fold mean class accuracy.
        label_scores: flat list with one accuracy per (fold, class), ordered
            fold by fold and, inside a fold, in the order of `y.unique()`.
    '''
    splitter = StratifiedKFold(n_splits=cv, random_state=123, shuffle=True)
    classes = y.unique()
    fold_means = []
    label_scores = []

    for train_idx, test_idx in splitter.split(X, y):
        X_fit, y_fit = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[test_idx], y.iloc[test_idx]

        # A new model per fold so that no state leaks between folds.
        if lgbm_params is None:
            model = LGBMClassifier(verbose=-1)
        else:
            model = LGBMClassifier(**lgbm_params)

        model.fit(X=X_fit, y=y_fit)
        predicted = model.predict(X=X_val)

        # Accuracy restricted to the rows whose true label is `cls`.
        per_class = []
        for cls in classes:
            rows = y_val == cls
            per_class.append(accuracy_score(y_val[rows], predicted[rows]))

        fold_means.append(np.mean(per_class))
        label_scores.extend(per_class)

    return np.mean(fold_means), label_scores
# test_lgbm_model(X,y)

# random forest

In [None]:
def test_random_forest(X, y, by_target=False, rf_params=None, cv=5):
    '''
    Cross-validate a Random Forest classifier and collect per-class accuracy.

    The data is split with a shuffled StratifiedKFold (random_state=123).
    For every fold a fresh classifier is fitted and, for each distinct value
    of `y`, the accuracy on the held-out rows of that class is computed.

    Args:
        X: Dataframe with predictor features.
        y: Series with the target feature.
        by_target: Accepted for signature compatibility; not used here
            (nothing is printed by this function).
        rf_params: Keyword arguments for RandomForestClassifier. When None,
            a default forest seeded with random_state=123 is used so that
            repeated runs give the same scores.
        cv: Number of stratified folds.

    Returns:
        average_score: mean over folds of the per-fold mean class accuracy.
        label_scores: flat list with one accuracy per (fold, class), ordered
            fold by fold and, inside a fold, in the order of `y.unique()`.
    '''
    folds = StratifiedKFold(n_splits=cv, random_state=123, shuffle=True)
    fin_scores = []
    label_scores = []
    unique_targets = y.unique()

    for train_index, test_index in folds.split(X, y):
        Xf_train = X.iloc[train_index]
        yf_train = y.iloc[train_index]

        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]

        if rf_params is not None:
            rfc = RandomForestClassifier(**rf_params)
        else:
            # Seed the default forest: the folds are already seeded, so an
            # unseeded model would be the only source of run-to-run noise.
            rfc = RandomForestClassifier(random_state=123)

        rfc.fit(Xf_train, yf_train)
        y_pred = rfc.predict(X_test)

        # Accuracy restricted to the rows whose true label is `target`.
        target_scores = []
        for target in unique_targets:
            target_mask = y_test == target
            target_scores.append(
                accuracy_score(y_test[target_mask], y_pred[target_mask])
            )

        fin_scores.append(np.mean(target_scores))
        label_scores.extend(target_scores)

    average_score = np.mean(fin_scores)

    return average_score, label_scores
# test_random_forest(X,y)

# xgboost

In [None]:
def test_xgboost(X, y, by_target=False, xgb_params=None, cv=5):
    '''
    Cross-validate an XGBoost classifier and collect per-class accuracy.

    The data is split with a shuffled StratifiedKFold (random_state=123).
    For every fold a fresh classifier is fitted and, for each distinct value
    of `y`, the accuracy on the held-out rows of that class is computed.

    Args:
        X: Dataframe with predictor features.
        y: Series with the target feature.
        by_target: Accepted for signature compatibility; not used here
            (nothing is printed by this function).
        xgb_params: Keyword arguments for XGBClassifier. When None, the
            classifier is built with `verbosity=0` only.
        cv: Number of stratified folds.

    Returns:
        average_score: mean over folds of the per-fold mean class accuracy.
        label_scores: flat list with one accuracy per (fold, class), ordered
            fold by fold and, inside a fold, in the order of `y.unique()`.
    '''
    folds = StratifiedKFold(n_splits=cv, random_state=123, shuffle=True)
    fin_scores = []
    label_scores = []
    unique_targets = y.unique()

    for train_index, test_index in folds.split(X, y):
        Xf_train = X.iloc[train_index]
        yf_train = y.iloc[train_index]

        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]

        if xgb_params is not None:
            xgb = XGBClassifier(**xgb_params)
        else:
            # XGBoost's logging switch is `verbosity` (0 = silent);
            # `verbose=-1` is the LightGBM spelling and is not an XGBoost
            # parameter.
            xgb = XGBClassifier(verbosity=0)

        xgb.fit(X=Xf_train, y=yf_train)
        y_pred = xgb.predict(X=X_test)

        # Accuracy restricted to the rows whose true label is `target`.
        target_scores = []
        for target in unique_targets:
            target_mask = y_test == target
            target_scores.append(
                accuracy_score(y_test[target_mask], y_pred[target_mask])
            )

        fin_scores.append(np.mean(target_scores))
        label_scores.extend(target_scores)

    average_score = np.mean(fin_scores)

    return average_score, label_scores
# test_xgboost(X,y)

# catboost

In [None]:
def test_catboost(X, y, by_target=False, cat_params=None, cv=5):
    '''
    Cross-validate a CatBoost classifier and collect per-class accuracy.

    The data is split with a shuffled StratifiedKFold (random_state=123).
    For every fold a fresh classifier is fitted on a CatBoost Pool and, for
    each distinct value of `y`, the accuracy on the held-out rows of that
    class is computed.

    Args:
        X: Dataframe with predictor features.
        y: Series with the target feature.
        by_target: Accepted for signature compatibility; not used here
            (nothing is printed by this function).
        cat_params: Keyword arguments for CatBoostClassifier. When None, the
            classifier is built with `verbose=0` only.
        cv: Number of stratified folds.

    Returns:
        average_score: mean over folds of the per-fold mean class accuracy.
        label_scores: flat list with one accuracy per (fold, class), ordered
            fold by fold and, inside a fold, in the order of `y.unique()`.
    '''
    splitter = StratifiedKFold(n_splits=cv, random_state=123, shuffle=True)
    classes = y.unique()
    fold_means = []
    label_scores = []

    for train_idx, test_idx in splitter.split(X, y):
        X_fit, y_fit = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[test_idx], y.iloc[test_idx]

        # A new model per fold so that no state leaks between folds.
        if cat_params is None:
            model = CatBoostClassifier(verbose=0)
        else:
            model = CatBoostClassifier(**cat_params)

        model.fit(Pool(data=X_fit, label=y_fit))
        predicted = model.predict(X_val)

        # Accuracy restricted to the rows whose true label is `cls`.
        per_class = []
        for cls in classes:
            rows = y_val == cls
            per_class.append(accuracy_score(y_val[rows], predicted[rows]))

        fold_means.append(np.mean(per_class))
        label_scores.extend(per_class)

    return np.mean(fold_means), label_scores

# params

In [None]:
# Hyper-parameters for each model family. Every dict is passed as **kwargs to
# the corresponding classifier constructor.

lgbm_params = {
    'objective': 'multiclass',  # multi-class target
    'data_sample_strategy': 'goss',  # Gradient-based One-Side Sampling
    'tree_learner': 'feature',  # feature-parallel tree learner
    'n_estimators': 743,  # number of boosting iterations
    'learning_rate': 0.02636616162598401,  # shrinkage applied to each boosting step
    'feature_fraction': 0.298183729482288,  # fraction of features sampled for each tree (~30%)
    'lambda_l1': 8.242410039948067e-07,  # L1 regularization strength
    'lambda_l2': 0.4063299210212167,  # L2 regularization strength
    'num_leaves': 699,  # maximum number of leaves per tree
    'max_depth': 8,  # maximum tree depth
    # NOTE(review): in LightGBM `colsample_bytree` is an alias of
    # `feature_fraction`, which is also set above with a different value.
    # Confirm which value is intended and drop the other.
    'colsample_bytree': 0.7975468653525116,
    'min_child_samples': 102,  # minimum number of samples per leaf
    'min_sum_hessian_in_leaf': 5.440582524630883,  # minimum sum of hessians allowed in a leaf
    'min_gain_to_split': 0.7247318987185962,  # minimum gain required to make a split
    'max_bin': 156,  # maximum number of bins used to bucket feature values
    'top_rate': 0.6132659772851583,  # GOSS: share of large-gradient rows that is always kept
    'verbose': -1,  # silence warnings and training logs
    'random_state': 123  # for model repeatability
}
rf_params = {
    'n_estimators': 100,  # number of trees in the forest
    'max_depth': 10,  # maximum depth of individual trees
    'min_samples_split': 2,  # minimum samples required to split a node
    'min_samples_leaf': 1,  # minimum samples required at each leaf node
    # 'auto' is no longer accepted by recent scikit-learn releases; for
    # classifiers it meant sqrt(n_features), which is spelled 'sqrt'.
    'max_features': 'sqrt',  # number of features considered at each split
    'bootstrap': True,  # whether to use bootstrap aggregating (bagging)
    'random_state': 123  # for model repeatability
}
xgb_params = {
    # XGBoost's multi-class objectives are 'multi:softprob' / 'multi:softmax';
    # 'multiclass' is the LightGBM name and is not an XGBoost objective.
    'objective': 'multi:softprob',
    'n_estimators': 100,  # number of boosting rounds
    'learning_rate': 0.1,  # learning rate (controls step size)
    'max_depth': 6,  # maximum depth of individual trees
    'min_child_weight': 1,  # minimum sum of hessian for child nodes
    'gamma': 0,  # minimum loss reduction required for a split
    'subsample': 0.8,  # fraction of samples for each boosting round
    'colsample_bytree': 0.8,  # fraction of features considered per tree
    'colsample_bylevel': 1,  # fraction of features considered per level
    'reg_alpha': 1,  # L1 regularization term
    'reg_lambda': 0,  # L2 regularization term
    'random_state': 123  # for model repeatability
}
cat_params = {
    #'loss_function': 'MultiClass', # adjust for your problem type
    #'eval_metric': 'MultiClass', # adjust for your problem (classification metric)
    'iterations': 100,  # number of boosting rounds
    #'learning_rate': 0.1, # learning rate (controls step size)
    'depth': 4,  # maximum depth of individual trees
    #'l2_leaf_reg': 3, # L2 regularization term on leaf values
    #'min_data_in_leaf': 10, # minimum samples required at each leaf node
    #'random_strength': 1, # amount of randomness in feature selection
    #'od_wait': 20, # patience for early stopping based on validation metric
    'verbose': 0,  # turn off log output for readability
    'random_state': 123  # for model repeatability
}

# experiment

In [None]:
def test_params(X, y, by_target=False, model_type="lgbm", lgbm_params=None, rf_params=None, xgb_params=None, cat_params=None):
    '''
    This function checks the model type and calls the appropriate model testing function.

    Args:
        X: Dataframe with predictor features.
        y: Series with the target feature.
        by_target: Print scores for each target (default=False).
        model_type: Type of model to test (default="lgbm").
        lgbm_params: A dict of the LGBM tuned parameters (default=None).
        rf_params: A dict of the Random Forest tuned parameters (default=None).

    Returns:
        None - if only one score return None.
        max(scores) - if multiple scores, return the highest one.
  '''
    scores = []
    target_names = ['Graduate', 'Dropout', 'Enrolled']
    
    if model_type == "lgbm":
        average_score, label_scores = test_lgbm_model(X, y, lgbm_params)
    elif model_type == "random_forest":
        average_score, label_scores = test_random_forest(X, y, rf_params=rf_params)
    elif model_type == 'xgboost':
        average_score, label_scores = test_xgboost(X, y, xgb_params=xgb_params)
    elif model_type == 'catboost':
        average_score, label_scores = test_catboost(X, y, cat_params=cat_params)
    else:
        print(f"Model type '{model_type}' not supported.")
        return None

    if by_target:
        unique_targets = y.unique()
        label_score_names = unique_targets.tolist()
        
        for i, score in enumerate(zip(label_score_names, label_scores)):
            target_name, target_score = score
            print(f'Score for target {target_names[target_name]}: {target_score}')

    if len(scores) > 1:
        return max(scores)
    else:
        return average_score

In [None]:
# Cross-validate every model family with its parameter dict and print the
# per-target scores.
test_params(X, y, by_target=True, model_type='lgbm', lgbm_params=lgbm_params)
test_params(X, y, by_target=True, model_type='random_forest', rf_params=rf_params)
test_params(X, y, by_target=True, model_type='xgboost', xgb_params=xgb_params)
test_params(X, y, by_target=True, model_type='catboost', cat_params=cat_params)

# train and voting

In [None]:
# Fit one model of each family on the full training set.

# Train LightGBM Model
LGBM_model = LGBMClassifier(**lgbm_params)
LGBM_model.fit(X, y)

# Train XGBoost Model
XGBoost_model = XGBClassifier(**xgb_params)
XGBoost_model.fit(X, y)

# Train Random Forest Model
# Seeded so that re-running the notebook reproduces the same forest.
# NOTE(review): this model does not use `rf_params`, although those are the
# settings that were cross-validated - confirm this is intended.
RandomForest_model = RandomForestClassifier(random_state=123)
RandomForest_model.fit(X, y)

# Train CatBoost Model
# NOTE(review): this model does not use `cat_params`, although those are the
# settings that were cross-validated - confirm this is intended.
Categorical_Model = CatBoostClassifier(verbose=0)
Categorical_Model.fit(X, y)

In [None]:
# Load the unlabelled test set; it is passed to the fitted models as-is and
# its 'id' column is reused for the submission file.
df_test=pd.read_csv('data/test.csv')

In [None]:
# Make predictions from each model
predictions_by_model = {}
for model_name, model in [('lgbm', LGBM_model),
                          ('xgboost', XGBoost_model),
                          ('random_forest', RandomForest_model),
                          ('catboost', Categorical_Model)]:
    # Flatten to 1-D so every entry has shape (n_rows,); some wrappers may
    # return a column vector for multi-class predictions.
    predictions_by_model[model_name] = np.asarray(model.predict(df_test)).ravel()

# Final prediction selection based on class-wise strengths:
#   * if XGBoost predicts class 2, take XGBoost's prediction;
#   * otherwise take LightGBM's prediction.
# Random Forest and CatBoost predictions are computed above for inspection
# but do not take part in the final choice.
lgbm_preds = predictions_by_model['lgbm']
xgboost_preds = predictions_by_model['xgboost']

final_predictions = np.where(xgboost_preds == 2, xgboost_preds, lgbm_preds).tolist()

# output predictions

In [None]:
# Build the submission frame: one row per test id with its predicted class.
id_val = df_test['id']
output = pd.DataFrame({'id': id_val, 'Target': final_predictions})

# Map the integer class codes back to the original labels.
output['Target'] = label_encoder.inverse_transform(output['Target'])

# Write the file without the index column. `to_csv` returns None when it is
# given a path, so `submission` is None after this line.
submission = output.to_csv('submission.csv', index=False)