In [23]:
import numpy as np

import pandas as pd

import lightgbm as lgb

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

In [24]:
# Build a small synthetic frame: two string-valued columns, one Gaussian
# numeric column and a 0/1 target. Every column is sampled on its own, without
# reference to the others.
np.random.seed(42)  # seed the global NumPy RNG so the draws below are repeatable

n_rows = 500

# Draw order is feature_1, feature_2, numerical_feature, target. Keep it, since
# all four draws consume the same seeded RNG stream one after another.
columns = {}
columns['feature_1'] = np.random.choice(['A', 'B', 'C'], size=n_rows)
columns['feature_2'] = np.random.choice(['X', 'Y'], size=n_rows)
columns['numerical_feature'] = np.random.randn(n_rows)
columns['target'] = np.random.choice([0, 1], size=n_rows)

data = pd.DataFrame(columns)

In [25]:
# Names of the columns to be stored with pandas' 'category' dtype.
categorical_features = ['feature_1', 'feature_2']

# Cast all listed columns in one call via a column -> dtype mapping; columns
# not named in the mapping keep their existing dtype.
data = data.astype({name: 'category' for name in categorical_features})

In [26]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same on every run.
features = data.drop(columns=['target'])
labels = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Wrap both splits as LightGBM Dataset objects, declaring the same categorical
# columns for each.
train_data = lgb.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=categorical_features,
)

# The held-out Dataset is tied to the training Dataset through `reference`.
test_data = lgb.Dataset(
    data=X_test,
    label=y_test,
    reference=train_data,
    categorical_feature=categorical_features,
)

In [28]:
# Hyper-parameters handed to LightGBM at training time: binary objective,
# binary log-loss as the evaluation metric, and the 'gbdt' boosting type.
params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
)

In [29]:
# Fit the booster for at most 100 rounds, halting early once the validation
# metric has gone 10 consecutive rounds without improving.
#
# NOTE(review): the validation set passed here is the same held-out split that
# the accuracy cell later scores, so that accuracy is not an unbiased estimate.
# A separate validation split would be needed to fix this.
early_stop = lgb.early_stopping(stopping_rounds=10)

model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=100,
    valid_sets=[test_data],
    callbacks=[early_stop],
)

[LightGBM] [Info] Number of positive: 200, number of negative: 200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.692926


In [30]:
# Score the held-out rows, then turn the scores into hard 0/1 labels.
# With the 'binary' objective these scores are presumably positive-class
# probabilities -- confirm against the LightGBM docs.
pred_scores = model.predict(X_test)

# Strictly greater than 0.5 maps to class 1; a score of exactly 0.5 maps to 0.
y_pred = (pred_scores > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.55
