In [None]:
def split_data_by_ticker(df, test_percentage=0.2):
    """
    Split each ticker's rows chronologically into training and testing sets.

    Within every ticker the rows are ordered by 'Day_Number'; the last
    ``int(n_samples * test_percentage)`` rows become test data and all
    earlier rows become training data. A ticker whose computed test size is
    0 is kept entirely in the training set.

    Args:
    - df (DataFrame): DataFrame containing 'Day_Number', 'Ticker', and other columns like 'Price'.
    - test_percentage (float): Fraction of each ticker's rows used for testing.
      Defaults to 0.2 (20% for testing, 80% for training). The resulting
      per-ticker test size is clamped to the range [0, n_samples].

    Returns:
    - train_df (DataFrame): Training rows of all tickers, sorted by 'Day_Number'.
    - test_df (DataFrame): Testing rows of all tickers, sorted by 'Day_Number'.
      Both are empty frames with df's columns when df has no ticker groups.
    """
    train_data = []
    test_data = []

    # Group by Ticker and iterate through each group
    for ticker, group in df.groupby('Ticker'):
        # Sort each ticker's group by Day_Number before splitting
        group = group.sort_values(by='Day_Number')

        n_samples = group.shape[0]
        # Clamp so an out-of-range percentage can never produce a negative
        # or oversized slice boundary.
        test_size = min(max(int(n_samples * test_percentage), 0), n_samples)
        print(f"Ticker: {ticker}, Samples: {n_samples}, Test size: {test_size}")

        # Slice at an explicit position instead of using -test_size:
        # with test_size == 0, iloc[:-0] is empty and iloc[-0:] is the whole
        # group, which would silently swap the train and test sets.
        split_at = n_samples - test_size
        train_data.append(group.iloc[:split_at])
        test_data.append(group.iloc[split_at:])

    # pd.concat raises on an empty list, so handle "no groups" explicitly.
    if not train_data:
        empty = df.iloc[0:0]
        return empty.copy(), empty.copy()

    # Concatenate all tickers and restore a global Day_Number ordering
    train_df = pd.concat(train_data).sort_values(by='Day_Number')
    test_df = pd.concat(test_data).sort_values(by='Day_Number')

    return train_df, test_df

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import uniform as sp_uniform
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

# One model is trained per distinct value of df['ticker'];
# df is expected to also carry the 'Close' target column.
tickers = df['ticker'].unique()

# Numeric features: standardise.
numeric_preprocessor = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical features: dense one-hot encoding with the first level dropped.
onehot_encoder = OneHotEncoder(
    sparse_output=False,
    drop='first',
    handle_unknown='infrequent_if_exist',
)
categorical_preprocessor = Pipeline(steps=[("onehot", onehot_encoder)])

# Route each column group to its preprocessor; columns in neither
# numerical_cols nor categorical_cols are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numeric_preprocessor, numerical_cols),
        ("categorical", categorical_preprocessor, categorical_cols),
    ],
    remainder='passthrough',
)

# Base XGBoost regressor (seeded)
xgb_model = XGBRegressor(random_state=0)

# Search space sampled by RandomizedSearchCV; the 'regressor__' prefix
# addresses the pipeline step named 'regressor'.
param_grid = {
    # scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. [0.03, 0.10]
    'regressor__learning_rate': sp_uniform(loc=0.03, scale=0.07),
    'regressor__max_depth': [4, 5, 6, 7],
    'regressor__n_estimators': list(range(300, 1001, 100)),
}

# Cross-validation splitter with 5 time-series splits
cv = TimeSeriesSplit(n_splits=5)

# ticker -> best fitted pipeline, filled in by the training loop
ticker_models = {}

# Run an independent randomized hyperparameter search for every ticker.
# NOTE(review): cv is a TimeSeriesSplit, which splits by row position, so this
# presumably relies on each ticker's rows already being in chronological
# order in df — confirm upstream.
for ticker in tickers:
    # Rows belonging to this ticker only
    ticker_data = df[df['ticker'] == ticker]
    y_ticker = ticker_data['Close']                           # target
    X_ticker = ticker_data.drop(columns=['Close', 'ticker'])  # features

    # Preprocessing followed by the regressor, searched as one estimator
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', xgb_model),
    ])

    # 15 sampled candidates, scored by negative RMSE; any failing fit raises
    grid_search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_grid,
        n_iter=15,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        n_jobs=-1,
        random_state=0,
        error_score='raise',
    )

    # fit() returns the search object itself, so the winning pipeline can be
    # read off the same expression and stored under the ticker.
    ticker_models[ticker] = grid_search.fit(X_ticker, y_ticker).best_estimator_

    print(f"Best model for ticker {ticker}:")
    print(grid_search.best_params_)

# Predict with each stored model and report its mean squared error.
# NOTE(review): the rows scored here come from the same df the models were
# fitted on, so these are in-sample errors, not a held-out evaluation.
predictions = {}
for ticker in ticker_models:
    model = ticker_models[ticker]

    # Same feature/target construction as in the training loop
    ticker_data = df[df['ticker'] == ticker]
    y_ticker = ticker_data['Close']
    X_ticker = ticker_data.drop(columns=['Close', 'ticker'])

    # Keep the predictions so later cells can reuse them
    y_pred = model.predict(X_ticker)
    predictions[ticker] = y_pred

    mse = mean_squared_error(y_ticker, y_pred)
    print(f"Mean Squared Error for {ticker}: {mse}")


In [None]:
# Combine the per-ticker predictions into a single array and score them
# together. (Predictions are gathered side by side; nothing is averaged.)
#
# The array is laid out in df's row order: each ticker's predictions are
# written back through that ticker's boolean row mask. Simply concatenating
# ticker by ticker would only line up with df['Close'] if df happened to be
# stored as contiguous per-ticker blocks; for interleaved rows (e.g. a frame
# sorted by date) it would compare predictions against the wrong targets.
combined_predictions = np.full(len(df), np.nan)
covered = np.zeros(len(df), dtype=bool)  # rows that received a prediction

for ticker, model in ticker_models.items():
    ticker_mask = (df['ticker'] == ticker).to_numpy()
    ticker_data = df[ticker_mask]
    X_ticker = ticker_data.drop(columns=['Close', 'ticker'])

    # Get predictions from the model and place them on this ticker's rows
    y_pred = model.predict(X_ticker)
    combined_predictions[ticker_mask] = y_pred
    covered |= ticker_mask

# Overall MSE across all tickers, restricted to rows that have a prediction
# so rows without a model cannot inject NaN into the metric.
y_true = df['Close'].values
combined_mse = mean_squared_error(y_true[covered], combined_predictions[covered])
print(f"Overall Mean Squared Error: {combined_mse}")
