In [None]:
from sklearn.linear_model import (
    LogisticRegression,
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    AdaBoostClassifier,
    AdaBoostRegressor,
)
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from xgboost import XGBRegressor

In [None]:
# Checking Performance of Linear Machine Learning Models: including prediction of new data

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
import torch
import os
import timeit
import joblib

def predict_future_prices(model, data, scale_params_df):
    """
    Predict one value per symbol and map it back to the original target scale.

    For every group of rows in ``data`` that shares a ``Symbol`` and has an entry
    in ``scale_params_df``, the model's prediction is rescaled as
    ``prediction * Target_Std + Target_Mean``.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        data (pandas.DataFrame): Feature rows with a ``Symbol`` column. A ``Target``
            column, if present, is dropped before predicting.
        scale_params_df (pandas.DataFrame): One row per symbol with the columns
            ``Symbol``, ``Target_Mean`` and ``Target_Std``.

    Returns:
        list[dict] | None: ``{'Symbol': ..., 'Prediction': ...}`` records in the row
        order of ``scale_params_df``. ``Prediction`` is ``None`` for symbols that have
        no rows in ``data``. Returns ``None`` (after printing a message) when ``data``
        has no ``Symbol`` column.

    Raises:
        ValueError: If a symbol's rescaled prediction holds more than one value;
            ``.item()`` only converts single-element results, so each symbol is
            expected to contribute exactly one row.
    """
    # Output follows the symbol order of the scale-parameter table.
    original_order = scale_params_df['Symbol'].tolist()

    # symbol -> {'Target_Mean': ..., 'Target_Std': ..., ...}
    scale_params = scale_params_df.set_index('Symbol').to_dict(orient='index')

    if 'Symbol' not in data.columns:
        print("The 'Symbol' column is missing from the data.")
        return None

    # A dict keeps the final ordering step linear instead of rescanning a list per symbol.
    predictions = {}
    for symbol, group_data in data.groupby('Symbol'):
        params = scale_params.get(symbol)
        if params is None:
            # No scale parameters for this symbol: it cannot be rescaled, so skip it.
            continue

        # Identifier and label columns are not model features.
        X = group_data.drop(columns=['Symbol', 'Target'], errors='ignore')
        y_pred = model.predict(X)

        # Undo the target standardisation and collapse the single value to a Python scalar.
        predictions[symbol] = (y_pred * params['Target_Std'] + params['Target_Mean']).item()

    return [
        {'Symbol': symbol, 'Prediction': predictions.get(symbol)}
        for symbol in original_order
    ]

def split_data(X, y, train_frac=0.7, valid_frac=0.15, random_state=False):
    """
    Split features and targets into train, validation and test partitions.

    The first ``int(n * train_frac)`` rows go to training, the next
    ``int(n * valid_frac)`` rows to validation and the remainder to test, where
    ``n = X.shape[0]``. Rows are taken by position, so NumPy arrays as well as
    pandas DataFrames/Series (with any index) are supported.

    Args:
        X: Feature container exposing ``.shape`` (NumPy array, DataFrame, ...).
        y: Target container aligned row-for-row with ``X``.
        train_frac (float): Fraction of rows used for training.
        valid_frac (float): Fraction of rows used for validation.
        random_state (bool | int): Falsy (``False``, ``None``, ``0``) keeps the
            original row order. ``True`` shuffles using NumPy's global random state.
            A non-zero integer shuffles reproducibly using that value as the seed
            (it must be a valid ``numpy.random.RandomState`` seed).

    Returns:
        tuple: ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
    """
    total_count = X.shape[0]
    train_size = int(total_count * train_frac)
    valid_size = int(total_count * valid_frac)
    valid_end = train_size + valid_size

    def take_rows(obj, rows):
        # pandas objects need .iloc for positional access: plain `[]` with an integer
        # array selects *columns* on a DataFrame and *labels* on a Series.
        return obj.iloc[rows] if hasattr(obj, 'iloc') else obj[rows]

    if random_state:
        is_seed = isinstance(random_state, (int, np.integer)) and not isinstance(random_state, bool)
        if is_seed:
            # Local generator: reproducible without touching the global random state.
            indices = np.random.RandomState(random_state).permutation(total_count)
        else:
            indices = np.random.permutation(total_count)
        train_rows = indices[:train_size]
        valid_rows = indices[train_size:valid_end]
        test_rows = indices[valid_end:]
    else:
        # Sequential split (e.g. time-ordered data): contiguous slices in original order.
        train_rows = slice(None, train_size)
        valid_rows = slice(train_size, valid_end)
        test_rows = slice(valid_end, None)

    X_train, y_train = take_rows(X, train_rows), take_rows(y, train_rows)
    X_valid, y_valid = take_rows(X, valid_rows), take_rows(y, valid_rows)
    X_test, y_test = take_rows(X, test_rows), take_rows(y, test_rows)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

def _split_metrics(model, X, y, prefix):
    """Score ``model`` on one data split and return R^2 / MAE / MSE keyed by ``prefix``."""
    y_pred = model.predict(X)
    return {
        f"{prefix} R^2": r2_score(y, y_pred),
        f"{prefix} MAE": mean_absolute_error(y, y_pred),
        f"{prefix} MSE": mean_squared_error(y, y_pred),
    }


def check_model_performance_linear(ModelClasses, data, dependent_var, drop_columns=[], train_frac=0.7, valid_frac=0.15, scaler=MinMaxScaler(),
                                   pca=5, random_state=True, output_file_path=None, save_directory=None):
    """
    Fit several regressors, score each on train/validation/test data, and predict future prices.

    Args:
        ModelClasses (list): Estimator classes; each is instantiated with default arguments.
        data (pandas.DataFrame | None): Dataset to split and scale. When ``None``, the
            pre-split files ``X_train.csv``, ``y_train.csv``, ``X_valid.csv``,
            ``y_valid.csv``, ``X_test.csv`` and ``y_test.csv`` are read from
            ``output_file_path`` and no scaler is applied.
        dependent_var (str): Target column name in ``data`` (unused when ``data`` is ``None``).
        drop_columns (list, optional): Extra columns of ``data`` to exclude from the features.
        train_frac (float, optional): Training fraction passed to ``split_data``.
        valid_frac (float, optional): Validation fraction passed to ``split_data``.
        scaler (object, optional): Scaler fitted on the (optionally PCA-reduced) training
            features when ``data`` is given. Not used when ``data`` is ``None``.
        pca (int | None, optional): ``n_components`` for PCA, fitted on the training
            features; ``None`` disables PCA.
        random_state (optional): Forwarded to ``split_data``; a truthy value shuffles the
            rows before splitting.
        output_file_path (str, optional): Directory holding ``future_data.csv`` and
            ``scale_params.csv``. These are always read, so a path is effectively required.
        save_directory (str, optional): If given, created when missing and each fitted
            model is saved there as ``<ModelName>.joblib``.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: ``(results, future_predictions)`` where
        ``results`` has one row of R^2 / MAE / MSE metrics per model and
        ``future_predictions`` is indexed by ``Symbol`` (in ``future_data.csv`` order)
        with one prediction column per model.
    """
    if save_directory:
        os.makedirs(save_directory, exist_ok=True)

    # The scaler is only fitted when this function performs the split itself.
    fit_scaler = data is not None

    if data is None:
        def read_split(file_name):
            return pd.read_csv(os.path.join(output_file_path, file_name))

        X_train, y_train = read_split('X_train.csv'), read_split('y_train.csv')
        X_valid, y_valid = read_split('X_valid.csv'), read_split('y_valid.csv')
        X_test, y_test = read_split('X_test.csv'), read_split('y_test.csv')
    else:
        # Define the independent and dependent variables
        X = data.drop([dependent_var] + list(drop_columns), axis=1)
        y = data[dependent_var]

        # Hand over plain arrays so a shuffled split indexes rows by position
        # regardless of the DataFrame's column labels or index.
        X_train, y_train, X_valid, y_valid, X_test, y_test = split_data(
            X.to_numpy(), y.to_numpy(),
            train_frac=train_frac, valid_frac=valid_frac, random_state=random_state,
        )

    # Dimension reduction using PCA (Principal Component Analysis), fitted on training data only
    pca_transformer = None
    if pca is not None:
        pca_transformer = PCA(n_components=pca)
        X_train = pca_transformer.fit_transform(X_train)
        X_valid = pca_transformer.transform(X_valid)
        X_test = pca_transformer.transform(X_test)

    # Scale the independent variables (after PCA, matching the training pipeline order)
    if fit_scaler:
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)
        X_test = scaler.transform(X_test)

    # Future data is identical for every model, so load and transform it once.
    future_data = pd.read_csv(os.path.join(output_file_path, 'future_data.csv'))
    scale_params = pd.read_csv(os.path.join(output_file_path, 'scale_params.csv'))
    future_symbols = future_data['Symbol']

    if pca_transformer is None and not fit_scaler:
        # No transformation was applied to the training features: use the rows as loaded.
        future_features = future_data
    else:
        # 'Symbol' is an identifier and 'Target' (if present) a label; neither is a feature.
        # Assumes the remaining columns match the training feature columns in number and
        # order — TODO confirm against how future_data.csv is produced.
        future_numeric = future_data.drop(columns=['Symbol', 'Target'], errors='ignore')
        if fit_scaler:
            # Transformers were fitted on arrays in this branch; pass an array for consistency.
            future_numeric = future_numeric.to_numpy()
        if pca_transformer is not None:
            future_numeric = pca_transformer.transform(future_numeric)
        if fit_scaler:
            future_numeric = scaler.transform(future_numeric)

        # Re-attach 'Symbol' so predictions can be grouped and rescaled per symbol.
        future_features = pd.DataFrame(future_numeric, index=future_data.index)
        future_features.insert(0, 'Symbol', future_symbols)

    metric_rows = []
    results_future = []

    for ModelClass in ModelClasses:
        start = timeit.default_timer()

        model_name = ModelClass.__name__

        # Initialize and fit the model
        model = ModelClass().fit(X_train, y_train)

        # Save the model
        if save_directory:
            model_filename = os.path.join(save_directory, f"{model_name}.joblib")
            joblib.dump(model, model_filename)
            print(f"Saved {model_name} model to {model_filename}")

        # Metrics on each split, stored in Train / Validation / Test column order
        row = {"Model": model_name}
        row.update(_split_metrics(model, X_train, y_train, "Train"))
        row.update(_split_metrics(model, X_valid, y_valid, "Validation"))
        row.update(_split_metrics(model, X_test, y_test, "Test"))
        metric_rows.append(row)

        # Predict future prices and map them back to the original price scale
        rescaled_predictions = predict_future_prices(model, future_features, scale_params)
        rescaled_predictions_df = pd.DataFrame(rescaled_predictions)
        rescaled_predictions_df['Model'] = model_name
        results_future.append(rescaled_predictions_df)

        duration = timeit.default_timer() - start
        print(f"Execution Time of Symbol_{model_name} is: {duration} seconds")

    results = pd.DataFrame(metric_rows)

    # Combine all prediction dataframes into a single dataframe
    results_future_df = pd.concat(results_future, ignore_index=True)

    # One row per symbol, one column per model, in the order symbols appear in future_data
    original_order = future_data['Symbol'].unique()
    pivoted_results_future_df = results_future_df.pivot(index='Symbol', columns='Model', values='Prediction')
    pivoted_results_future_df = pivoted_results_future_df.reindex(original_order)

    return results, pivoted_results_future_df


In [None]:
# Regressors to compare in this run; commented entries are disabled for this run.
ModelClasses = [
    LinearRegression,
    Ridge,
    Lasso,
    DecisionTreeRegressor,
    # AdaBoostRegressor,
    KNeighborsRegressor,
    XGBRegressor,
    # RandomForestRegressor,
    # GradientBoostingRegressor,
    # SVR
]

# Run configuration
train_frac = 0.7
valid_frac = 0.20
scaler = StandardScaler()
pca = None
random_state = True
time_series = False
new_data = None

# NOTE(review): absolute, machine-specific paths — consider deriving them from a
# configurable base directory so the notebook runs on other machines.
output_file_path=r"C:\Users\young78703\Documents\GitHub\Machine-Learning-Projects\Data_save\ML_Regression\Time_Series_Lag\stock_SP_100_indicator_daily_05072024"

# Legacy example call against a different dataset, kept for reference:
# check_model_performance_linear(ModelClasses, df, 'Price', drop_columns=['Suburb', 'Address','Type','Method', 'Bedroom2', 'SellerG','Date','Postcode', 'CouncilArea', 'Lattitude',
#    'Longitude', 'Regionname'])
save_directory = r'C:\Users\young78703\Documents\GitHub\Machine-Learning-Projects\output\ML_Regression\Time_Series_Lag\stock_SP_100_indicator_daily_05072024'

# train_frac / valid_frac are forwarded explicitly so the values configured above
# are the ones the function actually receives (previously its defaults were used).
results, results_future = check_model_performance_linear(
    ModelClasses=ModelClasses,
    data=None,
    dependent_var=None,
    drop_columns=[],
    train_frac=train_frac,
    valid_frac=valid_frac,
    scaler=scaler,
    output_file_path=output_file_path,
    pca=pca,
    random_state=random_state,
    save_directory=save_directory,
)
print(results)
print(results_future)

# Persist the metric table and the per-symbol future predictions next to the saved models.
os.makedirs(save_directory, exist_ok=True)

results.to_csv(os.path.join(save_directory, 'Results.csv'))
results_future.to_csv(os.path.join(save_directory, 'Results_Futures.csv'))

In [None]:
    # Candidate regressors for the hyperparameter search.
    ModelClasses = [
        LinearRegression,
        Ridge,
        Lasso,
        DecisionTreeRegressor,
        RandomForestRegressor,
        GradientBoostingRegressor,
        AdaBoostRegressor,
        SVR,
        KNeighborsRegressor,
        XGBRegressor,
    ]

    # Search grid per model, keyed by the estimator's class name.
    # An empty grid means the model has no parameters to search.
    hyperparameters = {
        'LinearRegression': {},
        'Ridge': dict(alpha=[0.1, 1, 10]),
        'Lasso': dict(alpha=[0.1, 1, 10]),
        'DecisionTreeRegressor': dict(
            max_depth=[2, 4, 8],
            min_samples_leaf=[1, 2, 5],
        ),
        'RandomForestRegressor': dict(
            n_estimators=[50, 100, 150],
            max_depth=[2, 4, 8],
        ),
        'GradientBoostingRegressor': dict(
            n_estimators=[50, 100, 150],
            learning_rate=[0.01, 0.1, 1],
            max_depth=[2, 4],
        ),
        'AdaBoostRegressor': dict(
            n_estimators=[25, 50, 100],
            learning_rate=[0.5, 1, 2],
        ),
        'SVR': dict(
            C=[0.1, 1, 10],
            kernel=['linear', 'rbf'],
        ),
        'KNeighborsRegressor': dict(
            n_neighbors=[3, 5, 7],
            weights=['uniform', 'distance'],
        ),
        'XGBRegressor': dict(
            n_estimators=[50, 100, 150],
            learning_rate=[0.01, 0.1, 1],
            max_depth=[2, 4, 6],
            subsample=[0.5, 0.75, 1],
        ),
    }

In [None]:
# Grid-search every candidate model and write the resulting table to CSV.
# NOTE(review): `ML_Regression_models_with_GridSearch` and `df` are not defined in the
# cells shown here — presumably they come from another cell or module; confirm they
# exist on a fresh kernel before running this cell.
returns = ML_Regression_models_with_GridSearch(
    ModelClasses,
    hyperparameters,
    df,
    'Price',
    drop_columns=[
        'Suburb', 'Address', 'Type', 'Method', 'Bedroom2', 'SellerG',
        'Date', 'Postcode', 'CouncilArea', 'Lattitude', 'Longitude', 'Regionname',
    ],
    new_data=None,
    test_size=0.2,
    random_state=42,
    scaler=StandardScaler(),
    scoring='neg_mean_squared_error',
    cv=3,
    pca=None,
    save_model=False,
    save_dir=None,
)
print(returns)
returns.to_csv('ML_Linear_Model_Results.csv', index=False)