In [None]:
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification

from scipy.sparse import csr_matrix

from sklearn.metrics import accuracy_score

In [None]:
# Build a synthetic classification dataset: 1000 samples, 10 features.
# The fixed random_state keeps the generated data reproducible across runs.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    random_state=42,
)


In [None]:
# Hold out 20% of the samples as a test set; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


1️⃣ Using LightGBM’s Dataset Storage Optimization

In [None]:
# Wrap the training split in a LightGBM Dataset and write it to disk
# in LightGBM's binary format.
train_data = lgb.Dataset(data=X_train, label=y_train)
train_data.save_binary(filename='train_data.bin')

In [None]:
# Re-create a Dataset from the binary file written by the previous cell.
train_data_bin = lgb.Dataset(data='train_data.bin')

✅ Key Benefit: Saves memory & loads faster than traditional file formats.

In [None]:
# Train a booster on the Dataset loaded from the binary file.
# Positional arguments are (params, train_set); the original cell was missing
# the comma between them and failed with a SyntaxError.
lgb_clf = lgb.train(
    {'objective': 'binary'},
    train_data_bin,
    num_boost_round=10,  # number of boosting iterations
)

2️⃣ Using Out-of-Core Training for Memory Efficiency

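When the raw data is too large to load comfortably into Python, we can pass LightGBM the path of a file on disk and let it read the data directly, instead of building an in-memory array first.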

In [None]:
# Point the Dataset at a file on disk instead of an in-memory array.
# NOTE(review): 'large_dataset.csv' is not created anywhere in this notebook;
# it must exist before this Dataset is used for training.
train_data = lgb.Dataset(
    'large_dataset.csv',
    params={'max_bin': 255},
    # The keyword is `free_raw_data` (the original `free_row_data` raises TypeError).
    free_raw_data=False,
)

3️⃣  Reducing Memory Usage with Sparse Matrices

Using scipy.sparse matrices saves memory by storing only non-zero values.

In [None]:
# Re-encode both splits as SciPy CSR (compressed sparse row) matrices.
X_train_sparse, X_test_sparse = map(csr_matrix, (X_train, X_test))

In [None]:
# Fit a 100-estimator LightGBM classifier directly on the CSR training matrix.
lgb_sparse = lgb.LGBMClassifier(n_estimators=100)
lgb_sparse.fit(X=X_train_sparse, y=y_train)

✅ Key Benefit: Uses much less memory when working with sparse data.

Why Use Distributed Training?

When datasets become too large for a single machine, distributed training helps by spreading computations across multiple machines or GPUs.

✅ Key Benefits of Distributed Training in LightGBM:
- Scales across multiple nodes (parallel training on clusters)
- Efficient for huge datasets (millions or billions of rows)
- Utilizes multiple CPUs or GPUs simultaneously


In [None]:
# Classifier settings for multi-threaded training on a single machine.
params = dict(
    objective='binary',
    num_threads=8,      # use 8 CPU threads
    n_estimators=200,
)

In [None]:
# Build the classifier from `params` and fit it on the dense training split.
lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out split and report the accuracy
# against the true test labels.
y_pred = lgb_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

4️⃣ Distributed Training Across Multiple Machines

LightGBM supports distributed training using multiple machines via MPI (Message Passing Interface).


📌 Steps to Set Up Distributed Training Across Machines
Install LightGBM with MPI support

In [None]:
# Shell commands must be prefixed in a code cell: a bare `sudo ...` line is a
# Python SyntaxError. `%pip` installs into the running kernel's environment.
# NOTE(review): a plain `pip install lightgbm` presumably does not enable MPI;
# an MPI-enabled build likely needs a source build with the MPI option switched
# on — confirm against the LightGBM installation guide.
%pip install lightgbm
# -y answers the confirmation prompt, since a notebook cell has no interactive stdin.
!sudo apt-get install -y libopenmpi-dev openmpi-bin


Prepare data in LightGBM format (Each machine processes a part of the data)
Run training across multiple machines using MPI

In [None]:
# Launch 4 MPI processes, one per listed host, each running the training script.
# The leading `!` hands the line to the shell; without it the cell is a SyntaxError.
# NOTE(review): train_lightgbm.py is not defined in this notebook — it must exist
# on every host.
!mpirun -n 4 --host machine1,machine2,machine3,machine4 python train_lightgbm.py


5️⃣ Distributed Training on GPUs

LightGBM also supports GPU-accelerated distributed training.

In [None]:
# Classifier settings for GPU training.
params = dict(
    objective='binary',
    device='gpu',          # run training on the GPU
    gpu_platform_id=0,     # platform index 0
    gpu_device_id=0,       # device index 0 on that platform
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=200,
)

In [None]:
# Build the classifier from the GPU parameter set, fit it on the training
# split, then print a completion message.
lgb_clf_gpu = lgb.LGBMClassifier(**params)
lgb_clf_gpu.fit(X=X_train, y=y_train)

print("GPU Training Completed!")

✅ Key Benefit: Train massive datasets faster with GPU acceleration (the example above uses a single GPU, device 0).