In [0]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import time

In [0]:
# Synthetic data set: `col_count` feature columns (X*) with matching targets (Y*),
# where Y<i> = X<i> * (i + 2). The same rows are replicated for every hotel.
SEED = 42
rng = np.random.default_rng(SEED)  # seeded so a fresh "Run All" reproduces the data

row_count = 5000
hotel_ids = list(range(1, 11))
col_count = 21

# One (row_count, 1) column per feature; each column is scaled by its own
# random integer drawn from [2, 10).
X_vars = [rng.random((row_count, 1)) * rng.integers(2, 10) for _ in range(col_count)]
Y_vars = [x_var * (index + 2) for index, x_var in enumerate(X_vars)]

X_cols = [f"X{i}" for i in range(col_count)]
Y_cols = [f"Y{i}" for i in range(col_count)]

# Kept under its own name so `hotel_df` only ever refers to the final joined frame.
hotel_ids_df = pd.DataFrame(hotel_ids, columns=['HotelID'])
all_data = pd.DataFrame(
    np.concatenate(Y_vars + X_vars, axis=1), columns=Y_cols + X_cols
)

# Cross join: every hotel id is paired with every row of `all_data`.
hotel_df = pd.merge(hotel_ids_df, all_data, how='cross')

In [0]:
# Convert the pandas frame into a Spark DataFrame so it can be grouped by HotelID
# and processed with applyInPandas in the cells below.
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably it is
# the SparkSession injected by the notebook runtime — confirm.
hotel_df_sprk = spark.createDataFrame(hotel_df)

In [0]:
def train_inner_model(train_data: pd.DataFrame, y_col: str) -> "tuple[str, lgb.LGBMRegressor]":
    """Fit a single-threaded LightGBM regressor for one target column.

    Feature columns are taken from the notebook-level ``X_cols`` list.

    Args:
        train_data: Frame containing every column in ``X_cols`` plus ``y_col``.
        y_col: Name of the target column to fit.

    Returns:
        A ``(y_col, fitted_model)`` pair, so a caller collecting results that
        complete out of order can match each model back to its target.
    """
    # n_jobs=1 keeps LightGBM single-threaded; in this notebook the function is
    # submitted to an executor, which supplies the parallelism.
    model_ = lgb.LGBMRegressor(verbose=-1, n_jobs=1)
    model_.fit(X=train_data[X_cols], y=train_data[y_col])

    return y_col, model_

In [0]:
# Per-hotel training and prediction using a ThreadPoolExecutor
def train_model(id, train_data: pd.DataFrame):
    """Train the 'Y0' model for one HotelID group and predict on that same group.

    Intended to be passed to ``groupby('HotelID').applyInPandas``; the returned
    frame has the columns ``HotelID`` and ``Y0``.

    Args:
        id: Grouping key supplied by ``applyInPandas``; not used here.
        train_data: All rows of one group. Must contain ``HotelID``, the
            notebook-level ``X_cols`` feature columns and the ``Y0`` target.

    Returns:
        A pandas DataFrame with one row per input row: the group's ``HotelID``
        and the in-sample prediction for ``Y0``.

    Raises:
        KeyError: If no model could be trained for ``Y0``. The exception raised
            during training (if any) is attached as ``__cause__``.
    """
    # Targets to fit. Named differently from the notebook-level Y_cols so the
    # full target list is not shadowed inside this function.
    target_cols = ['Y0']
    max_threads = 11
    model_list_tp_ = {}
    failures = {}

    # Never start more workers than there are targets to train.
    n_workers = min(max_threads, len(target_cols))
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        future_to_target = {
            executor.submit(train_inner_model, train_data, y_col): y_col
            for y_col in target_cols
        }

        for future in as_completed(future_to_target):
            target = future_to_target[future]
            try:
                col_, model = future.result()
            except Exception as exc:
                # Remember the failure so it can be surfaced below instead of
                # being lost behind a bare dictionary lookup error.
                print(f"Model training failed for {target}: {exc}")
                failures[target] = exc
            else:
                model_list_tp_[col_] = model

    if 'Y0' not in model_list_tp_:
        # Same exception type as the former failed lookup, but chained to the
        # root cause so the real training error shows up in the traceback.
        raise KeyError("no trained model for 'Y0'") from failures.get('Y0')

    # Predict on the group's own feature rows.
    pred_list = model_list_tp_['Y0'].predict(train_data[X_cols].reset_index(drop=True))

    # Native Python floats for the Spark `double` column.
    pred_list = [float(num) for num in pred_list]

    return pd.DataFrame({
        'HotelID': train_data['HotelID'],
        'Y0': pred_list,
    })

# Run train_model once per HotelID group; each group's result must match the
# declared schema (HotelID integer, Y0 double).
output = (
    hotel_df_sprk
    .groupby('HotelID')
    .applyInPandas(train_model, schema="HotelID integer, Y0 double")
)

display(output)

HotelID,Y0
7,0.2784106397489441
7,4.691943375631299
7,0.2838056093589325
7,5.490857491225539
7,0.2103166135175545
7,3.424496616450732
7,0.8699813832999809
7,4.876195430901708
7,1.293400216247972
7,3.941902274153348


In [0]:
def train_inner_model2(train_data: pd.DataFrame, y_col: str) -> "tuple[str, lgb.LGBMRegressor]":
    """Fit a single-threaded LightGBM regressor for one target column.

    This is the variant submitted to the process pool in ``train_model2``.
    Feature columns are taken from the notebook-level ``X_cols`` list.

    Args:
        train_data: Frame containing every column in ``X_cols`` plus ``y_col``.
        y_col: Name of the target column to fit.

    Returns:
        A ``(y_col, fitted_model)`` pair, so a caller collecting results that
        complete out of order can match each model back to its target.
    """
    # n_jobs is the scikit-learn style name for LightGBM's num_threads; using it
    # here keeps this function consistent with train_inner_model.
    model_ = lgb.LGBMRegressor(verbose=-1, n_jobs=1)
    model_.fit(X=train_data[X_cols], y=train_data[y_col])

    return y_col, model_

# Per-hotel training and prediction using a ProcessPoolExecutor
def train_model2(id, train_data: pd.DataFrame):
    """Train the 'Y0' model for one HotelID group in a process pool, then predict.

    Intended to be passed to ``groupby('HotelID').applyInPandas``; the returned
    frame has the columns ``HotelID`` and ``Y0``.

    Args:
        id: Grouping key supplied by ``applyInPandas``; not used here.
        train_data: All rows of one group. Must contain ``HotelID``, the
            notebook-level ``X_cols`` feature columns and the ``Y0`` target.

    Returns:
        A pandas DataFrame with one row per input row: the group's ``HotelID``
        and the in-sample prediction for ``Y0``.

    Raises:
        KeyError: If no model could be trained for ``Y0``. The exception raised
            by the worker (if any) is attached as ``__cause__``.
    """
    # Targets to fit. Named differently from the notebook-level Y_cols so the
    # full target list is not shadowed inside this function.
    target_cols = ['Y0']
    max_procs = 4
    model_list_pp_ = {}
    failures = {}

    # NOTE(review): a process pool sends the callable and its arguments to the
    # child with the standard pickle module; confirm train_inner_model2 can be
    # resolved inside the Spark Python worker, otherwise every future fails.
    n_workers = min(max_procs, len(target_cols))  # no more workers than targets
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        future_to_target = {
            executor.submit(train_inner_model2, train_data, y_col): y_col
            for y_col in target_cols
        }

        for future in as_completed(future_to_target):
            forecast_point = future_to_target[future]
            try:
                col_, model = future.result()
            except Exception as exc:
                print(f"Model training failed for {forecast_point}: {exc}")
                # Keep the exception so it can be attached to the error below.
                failures[forecast_point] = exc
            else:
                print(f"Trained model for: {col_}")
                model_list_pp_[col_] = model

    if 'Y0' not in model_list_pp_:
        # Same type and message as before, now chained to the worker's
        # exception so the root cause is visible in the traceback.
        raise KeyError("'Y0' not found in model_list_pp_") from failures.get('Y0')

    # Predict on the group's own feature rows.
    pred_list = model_list_pp_['Y0'].predict(train_data[X_cols].reset_index(drop=True))

    # Native Python floats for the Spark `double` column.
    pred_list = [float(num) for num in pred_list]

    return pd.DataFrame({
        'HotelID': train_data['HotelID'],
        'Y0': pred_list,
    })

# Run train_model2 (the process-pool variant) once per HotelID group; each
# group's result must match the declared schema (HotelID integer, Y0 double).
output = (
    hotel_df_sprk
    .groupby('HotelID')
    .applyInPandas(train_model2, schema="HotelID integer, Y0 double")
)

display(output)
