In [2]:
import os
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def create_sequences(data, seq_len):
    sequences = []
    targets = []
    for i in range(len(data) - seq_len):
        seq = data[i:i+seq_len]
        target = data[i+seq_len:i+seq_len+1]
        sequences.append(seq)
        targets.append(target)
    return np.array(sequences), np.array(targets)

def prepare_and_save_data(data, dep_var, drop_columns, output_file_path, test_size=0.2, random_state=42, seq_len=10):
    # Ensure the output directory exists
    if not os.path.exists(output_file_path):
        os.makedirs(output_file_path, exist_ok=True)

    # Drop specified columns
    if drop_columns:
        data = data.drop(columns=drop_columns)
    
    # Create sequences
    X, y = create_sequences(data, seq_len)
    
    # Split data into training+validation sets and test set
    X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Further split training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
    X_valid_scaled = scaler.transform(X_valid.reshape(-1, X_valid.shape[-1])).reshape(X_valid.shape)
    X_test_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

    # Convert numpy arrays to tensors
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float)
    X_valid_tensor = torch.tensor(X_valid_scaled, dtype=torch.float)
    y_valid_tensor = torch.tensor(y_valid, dtype=torch.float)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float)

    return X_train_tensor, y_train_tensor, X_valid_tensor, y_valid_tensor, X_test_tensor, y_test_tensor

import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    """
    Long Short-Term Memory (LSTM) model class.
    
    This class defines an LSTM architecture with multiple LSTM layers 
    and activation functions between each layer. Dropout is also applied
    between LSTM layers for regularization. L1 and L2 regularization 
    are supported on the weights.
    """
    def __init__(self, input_size, hidden_size, num_layers=2, dropout_rate=0.1, activation_function=torch.nn.ReLU(), 
                 l1_regularization=0, l2_regularization=0):
        super(LSTMModel, self).__init__()

        self.activation_function = activation_function
        self.l1_regularization = l1_regularization
        self.l2_regularization = l2_regularization

        # Build the LSTM architecture
        self.lstm_layers = nn.ModuleList()
        self.lstm_layers.append(nn.LSTM(input_size, hidden_size, num_layers=num_layers, dropout=dropout_rate, batch_first=True))
        for _ in range(num_layers - 1):
            self.lstm_layers.append(nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, dropout=dropout_rate, batch_first=True))
        
        # Final fully connected layer
        self.fc = nn.Linear(hidden_size, 1)

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        """Initialize weights of layers using Xavier initialization."""

        for layer in self.lstm_layers:
            for param in layer.parameters():
                if len(param.shape) >= 2:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.zeros_(param)

        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def regularization_loss(self):
        """Calculate regularization loss from model weights."""

        l1_loss = 0
        l2_loss = 0

        for param in self.parameters():
            l1_loss += torch.norm(param, 1)
            l2_loss += torch.norm(param, 2) ** 2

        return self.l1_regularization * l1_loss + self.l2_regularization * l2_loss

    def forward(self, x):
        """Forward pass for the LSTM model."""

        for lstm_layer in self.lstm_layers:
            x, _ = lstm_layer(x)

        x = self.fc(x[:, -1, :])  # Taking the last output of the last LSTM layer

        return x.squeeze(1)  # Squeeze to remove the singleton dimension

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler
import os

def train_LSTM_model(model, X_train, y_train, X_valid, y_valid, n_epochs, optimizer=torch.optim.Adam, criterion=torch.nn.MSELoss, batch_size=32, learning_rate=1e-3, 
                    patience=10, min_delta=0.0001, max_norm=1.0, num_workers=0, pin_memory=False, validation_frequency=1, load_directory=None, save_directory=None, trial=1):

    """
    Trains a Long Short-Term Memory (LSTM) model using the specified parameters and data, implementing early stopping and model saving.

    Parameters:
    - model: The LSTM model to train.
    - X_train, y_train: Training data and labels.
    - X_valid, y_valid: Validation data and labels.
    - n_epochs: Number of epochs to train the model.
    - optimizer: The optimization algorithm (default: torch.optim.Adam).
    - criterion: Loss function (default: torch.nn.MSELoss).
    - batch_size: Size of batches for training and validation (default: 32).
    - learning_rate: Learning rate for the optimizer (default: 1e-3).
    - patience: Number of epochs with no improvement on validation loss to wait before stopping training early (default: 10).
    - min_delta: Minimum change in validation loss to qualify as an improvement (default: 0.0001).
    - max_norm: Maximum norm for gradient clipping (default: 1.0).
    - num_workers: Number of subprocesses to use for data loading (default: 0).
    - pin_memory: If True, the data loader will copy tensors into CUDA pinned memory before returning them (default: False).
    - validation_frequency: Frequency of validation in terms of number of epochs (default: 1).
    - load_directory: Directory from which to load the model if resuming training (default: None).
    - save_directory: Directory where to save the model and plots (default: None).
    - trial: Identifier for the training trial (default: 1).

    Returns:
    A dictionary containing:
    - 'train_losses': List of average training losses per epoch.
    - 'val_losses': List of average validation losses per validation cycle.
    - 'best_val_loss': Best validation loss achieved during training.
    - 'early_stopping_counter': Counter indicating the number of consecutive epochs without improvement in validation loss.
    """
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # Convert numpy arrays to PyTorch tensors
    X_train, y_train = torch.tensor(X_train, device=device).float(), torch.tensor(y_train, device=device).float()
    X_valid, y_valid = torch.tensor(X_valid, device=device).float(), torch.tensor(y_valid, device=device).float()

    # Create DataLoader for both training and validation sets
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=pin_memory)
    valid_dataset = TensorDataset(X_valid, y_valid)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

    optimizer = optimizer(model.parameters(), lr=learning_rate)
    criterion = criterion().to(device)
    scaler = GradScaler() # Enables mixed-precision training for faster computation.

    best_val_loss = float('inf')
    early_stopping_counter = 0

    # Lists to hold average losses per epoch
    train_losses = []
    val_losses = []

    for epoch in range(n_epochs):
        model.train()
        total_train_loss = 0

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad() # Reset gradients to zero for a new optimization step.
            with autocast(): # Enable mixed-precision training.
                y_pred = model(X_batch)
                loss = criterion(y_pred, y_batch) # Calculate loss.
                reg_loss = model.regularization_loss() # Calculate regularization loss.
                loss += reg_loss # Combine main loss and regularization loss.
            scaler.scale(loss).backward() # Scale loss to prevent underflow in mixed precision.
            scaler.unscale_(optimizer) # Unscale gradients before clipping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) # Clip gradients to prevent explosion.
            scaler.step(optimizer) # Perform optimization step.
            scaler.update() # Update scaler for next iteration.

            total_train_loss += loss.item() # Accumulate the loss.

        avg_train_loss = total_train_loss / len(train_loader) # Calculate average training loss for the epoch.
        train_losses.append(avg_train_loss)  # Store it for later analysis.

    # Perform validation at the specified frequency.
        if epoch % validation_frequency == 0:
            model.eval() # Set the model to evaluation mode.
            total_val_loss = 0
            with torch.no_grad():  # Disable gradient computation.
                for X_batch, y_batch in valid_loader:
                    y_pred = model(X_batch)
                    loss = criterion(y_pred, y_batch) # Calculate validation loss.
                    total_val_loss += loss.item() # Accumulate validation loss.

            avg_val_loss = total_val_loss / len(valid_loader) # Calculate average validation loss for this cycle.
            val_losses.append(avg_val_loss)  # Store it for later analysis.
            
            print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.5f}, Val Loss = {avg_val_loss:.5f}")

            # Early stopping logic.
            if avg_val_loss < best_val_loss - min_delta:
                best_val_loss = avg_val_loss
                early_stopping_counter = 0
                if save_directory:
                    os.makedirs(load_directory, exist_ok=True)
                    save_path = os.path.join(load_directory, f"LSTM_model_trial_{trial}.pt")
                    torch.save(model, save_path) # Save the model if there's an improvement.
            else:
                early_stopping_counter += 1
                if early_stopping_counter >= patience:
                    print(f"Early stopping at epoch {epoch} due to no improvement in validation loss.")
                    break # Exit the training loop if no improvement for 'patience' epochs.

    return {
        "train_losses": train_losses,
        "val_losses": val_losses,
        "best_val_loss": best_val_loss,
        "early_stopping_counter": early_stopping_counter
    }

import os
import timeit
import random
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

def model_save_LSTM(X_train, y_train, X_valid, y_valid, n_trials=1, load_directory=None, save_directory=None, plot_loss=True):
    
    """
    Performs multiple trials of training a Long Short-Term Memory (LSTM) model with randomly selected hyperparameters,
    saves the trained models, and optionally plots the training and validation loss curves.

    Parameters:
    - X_train, y_train: Training data and labels.
    - X_valid, y_valid: Validation data and labels.
    - n_trials (int): Number of training trials to execute with random hyperparameters.
    - load_directory (str, optional): Directory to load existing models from (not used in current implementation).
    - save_directory (str, optional): Directory to save trained models and results.
    - plot_loss (bool): If True, plots the training and validation loss curves after each trial.

    Returns:
    - all_results_params_df (pandas.DataFrame): DataFrame containing the parameters and results from all trials
      if save_directory is specified. Otherwise, returns a list of dictionaries with the same information.
    """
    if save_directory and not os.path.exists(save_directory):
        os.makedirs(save_directory)

    all_results_params = []

    for trial in range(n_trials):
        print(f"Trial {trial + 1} of {n_trials}")

        start = timeit.default_timer()

        # Randomly generate hyperparameters
        hidden_size = random.choice(range(16, 128))
        num_layers = random.choice(range(2, 6))
        dropout_rate = random.choice([0.1, 0.2, 0.3])
        batch_size = random.choice(range(16, 128))
        learning_rate = 0.0001 * random.choice(range(1, 16))
        activation_function = random.choice([torch.nn.ReLU(), torch.nn.LeakyReLU(negative_slope=0.01)])
        optimizer = random.choice([torch.optim.Adam, torch.optim.AdamW])
        criterion = random.choice([torch.nn.SmoothL1Loss])
        n_epochs = random.choice(range(100, 201))
        patience = random.choice(range(5, 11))
        min_delta = 0.0001 * random.choice(range(1, 2))
        l1_regularization = 0.00000001 * random.choice(range(1, 2))
        l2_regularization = l1_regularization

        # Define the LSTM model
        model = LSTMModel(input_size=X_train.shape[2], hidden_size=hidden_size, num_layers=num_layers,
                          dropout_rate=dropout_rate, activation_function=activation_function,
                          l1_regularization=l1_regularization, l2_regularization=l2_regularization)

        # Train the LSTM model
        training_results = train_LSTM_model(model, X_train, y_train, X_valid, y_valid, n_epochs, 
                                            batch_size=batch_size, learning_rate=learning_rate,
                                            patience=patience, min_delta=min_delta, optimizer=optimizer, criterion=criterion, load_directory=load_directory,
                                            save_directory=save_directory, trial=trial)

        train_losses, val_losses = training_results['train_losses'], training_results['val_losses']
        
        # Optionally plot the loss curves
        if plot_loss:
            plot_results(train_losses, val_losses, trial, save_directory=save_directory)

        # Collect results and parameters
        params = {
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "dropout_rate": dropout_rate,
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "activation_function": activation_function.__class__.__name__,
            "optimizer": optimizer.__name__,
            "criterion": criterion.__name__,
            "n_epochs": n_epochs,
            "patience": patience,
            "min_delta": min_delta,
            "l1_regularization": l1_regularization,
            "l2_regularization": l2_regularization
        }

        results_params = {**params, "trial": trial + 1, "train_loss": train_losses[-1], "val_loss": val_losses[-1]}
        all_results_params.append(results_params)

        # Save the results to a CSV file if a save directory is specified
        if save_directory:
            all_results_params_df = pd.DataFrame(all_results_params)
            all_results_params_df.to_csv(os.path.join(save_directory, "all_results_params.csv"), index=False)

        end = timeit.default_timer()
        print(f"Execution Time for Trial {trial + 1}: {end - start} seconds")

    return all_results_params_df if save_directory else all_results_params

import os
import timeit
import random
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

def evaluate_LSTM_model_and_save_results(model, X_test, y_test, load_directory=None, save_directory=None):
    """
    Evaluates a trained Long Short-Term Memory (LSTM) model on test data and saves the evaluation results.

    Parameters:
    - model: The trained LSTM model to evaluate.
    - X_test, y_test: Test data and labels.
    - load_directory (str, optional): Directory to load the trained model from.
    - save_directory (str, optional): Directory to save the evaluation results.

    Returns:
    - eval_results (dict): Dictionary containing evaluation metrics.
    """

    # Load the trained model if load_directory is specified
    if load_directory:
        model_path = os.path.join(load_directory, "best_model.pt")
        model = torch.load(model_path)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Convert test data to PyTorch tensors
    X_test, y_test = torch.tensor(X_test, device=device).float(), torch.tensor(y_test, device=device).float()

    # Create DataLoader for test set
    test_dataset = TensorDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    criterion = torch.nn.MSELoss()

    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()

    avg_loss = total_loss / len(test_loader)

    # Prepare evaluation results
    eval_results = {
        "test_loss": avg_loss
    }

    # Save evaluation results if save_directory is specified
    if save_directory:
        os.makedirs(save_directory, exist_ok=True)
        eval_results_path = os.path.join(save_directory, "evaluation_results.csv")
        eval_results_df = pd.DataFrame(eval_results, index=[0])
        eval_results_df.to_csv(eval_results_path, index=False)

    return eval_results

import os
import pandas as pd

def execute_and_evaluate_LSTM_models(data, dep_var, drop_columns, n_trials, top_k_models, save_directory, load_directory):
    """
    Executes a series of steps to train, evaluate, and select the top LSTM models based on a dataset. 
    It involves data preparation, model training with random hyperparameters for a specified number of trials,
    evaluation of these models on a test set, and selection of the top performing models based on Mean Absolute Error (MAE).

    Parameters:
    - data (pandas.DataFrame): The dataset containing features and the target variable.
    - dep_var (str): The name of the target variable in the dataset.
    - drop_columns (list of str): A list of column names to be dropped from the dataset before training.
    - n_trials (int): The number of training trials to conduct with randomly selected hyperparameters.
    - top_k_models (int): The number of top models to select based on their performance metric.
    - save_directory (str): The directory where to save training results, model files, and evaluation results.
    - load_directory (str): The directory from which to load trained model files for evaluation.

    Returns:
    - top_models_df (pandas.DataFrame): A DataFrame containing the evaluation results of the top k models selected based on MAE.

    This function orchestrates the workflow from data preprocessing, model training, model evaluation, 
    to the selection of the top performing models. It automates the process of experimenting with multiple 
    LSTM architectures and hyperparameters to identify models that achieve the best performance on the given dataset.
    """
    
    if not os.path.exists(save_directory):
        os.makedirs(save_directory, exist_ok=True)  # Ensure the save directory exists.
        
    # Prepare data and split into training, validation, and test sets, then save the processed data.
    X_train, y_train, X_valid, y_valid, X_test, y_test = prepare_and_save_data(data, dep_var, drop_columns, test_size=0.2, random_state=42)
    
    # Train models with randomly generated hyperparameters and save the training results and plots.
    model_save_LSTM(X_train, y_train, X_valid, y_valid, n_trials=n_trials, load_directory=load_directory, save_directory=save_directory, plot_loss=True)
    
    # Evaluate the trained models on the test set, calculate performance metrics, and save the evaluation results.
    evaluate_results_df = evaluate_LSTM_model_and_save_results(X_test, y_test, load_directory=load_directory, save_directory=save_directory, results_file_name='evaluation_results.csv')
    
    # Select the top k models based on the lowest MAE score and print the results.
    if 'MAE' in evaluate_results_df.columns:
        top_models_df = evaluate_results_df.nsmallest(top_k_models, 'MAE')
    else:
        print("MAE column not found in evaluate_results_df.")
    
    print("Top models based on MAE score:")
    print(top_models_df)
    
    # Save the evaluation results of the top models for further analysis.
    top_models_df.to_csv(os.path.join(save_directory, "top_models_evaluation.csv"), index=False)
    
    return top_models_df



In [16]:
import pandas as pd

# Define the path to the TSF file and the output CSV file
input_file_path = r'C:\Users\young78703\Documents\GitHub\Machine-Learning-Projects\Data\rideshare_dataset_without_missing_values.tsf'
output_csv_path = r'C:\Users\young78703\Documents\GitHub\Machine-Learning-Projects\Data\Time_Series\output.csv'

# Open the TSF file
with open(input_file_path, 'r') as file:
    lines = file.readlines()

# Find the index where actual data starts
data_start_index = 0
for index, line in enumerate(lines):
    if not line.startswith('#') and not line.startswith('@'):
        data_start_index = index
        break

# Extract data lines and remove any leading/trailing whitespace
data_lines = [line.strip() for line in lines[data_start_index:] if line.strip()]

# Assuming the data is tab-separated, you can adjust this if needed
df = pd.DataFrame([line.split('\t') for line in data_lines])

# Save to CSV
df.to_csv(output_csv_path, index=False)

print("Data has been saved to CSV.")

Data has been saved to CSV.


In [17]:
df.head()

Unnamed: 0,0
0,T0:Back Bay:Lyft:Lux:price_min:2018-11-26 06-0...
1,T1:Back Bay:Lyft:Lux:price_mean:2018-11-26 06-...
2,T2:Back Bay:Lyft:Lux:price_max:2018-11-26 06-0...
3,T3:Back Bay:Lyft:Lux:distance_min:2018-11-26 0...
4,T4:Back Bay:Lyft:Lux:distance_mean:2018-11-26 ...


In [11]:
def read_tsf_to_csv(input_path, output_path):
    start_reading = False
    data = []
    with open(input_path, 'r') as file:
        for line in file:
            if start_reading:
                if line.strip():  # Collect non-empty lines only
                    data.append(line.strip().split(','))  # Assume comma-separated values
            elif '@data' in line:
                start_reading = True  # Start reading after '@data'

    # Assume the file contains headers in the format described before '@data'
    headers = ['series_name', 'source_location', 'provider_name', 'provider_service', 'type', 
               'start_timestamp', 'price_min', 'price_mean', 'price_max', 'distance_min', 
               'distance_mean', 'distance_max', 'surge_min', 'surge_mean', 'surge_max', 
               'api_calls', 'temp', 'rain', 'humidity', 'clouds', 'wind']
    
    # Create a DataFrame
    df = pd.DataFrame(data, columns=headers)
    
    # Save to CSV
    df.to_csv(output_path, index=False)

    print("Data has been successfully saved to CSV.")

In [12]:
input_file_path = r'C:\Users\young78703\Documents\GitHub\Machine-Learning-Projects\Data\rideshare_dataset_without_missing_values.tsf'
output_csv_path = r'C:\Users\young78703\Documents\GitHub\Machine-Learning-Projects\Data\Time_Series\output.csv'
read_tsf_to_csv(input_file_path, output_csv_path)

ValueError: 21 columns passed, passed data had 541 columns

In [13]:
import pandas as pd

def read_tsf_to_csv(input_path, output_path):
    with open(input_path, 'r') as file:
        # Skip until @data
        for line in file:
            if '@data' in line:
                break
        # Read actual data
        data = [line.strip().split(',') for line in file if line.strip()]  # Adjust split based on actual delimiter

    # Check the number of fields in the first row
    print("Number of fields in the first data row:", len(data[0]))

    # Create a DataFrame with the appropriate number of columns
    # Example with a dynamic number of columns
    columns = [f'column_{i+1}' for i in range(len(data[0]))]  # Create dynamic column names based on data
    df = pd.DataFrame(data, columns=columns)

    # Save to CSV
    df.to_csv(output_path, index=False)
    print("Data has been successfully saved to CSV.")

In [14]:
input_file_path = r'C:\Users\young78703\Documents\GitHub\Machine-Learning-Projects\Data\rideshare_dataset_without_missing_values.tsf'
output_csv_path = r'C:\Users\young78703\Documents\GitHub\Machine-Learning-Projects\Data\Time_Series\output.csv'
read_tsf_to_csv(input_file_path, output_csv_path)

Number of fields in the first data row: 541
Data has been successfully saved to CSV.


In [15]:
df.head()

Unnamed: 0,0
0,T0:Back Bay:Lyft:Lux:price_min:2018-11-26 06-0...
1,T1:Back Bay:Lyft:Lux:price_mean:2018-11-26 06-...
2,T2:Back Bay:Lyft:Lux:price_max:2018-11-26 06-0...
3,T3:Back Bay:Lyft:Lux:distance_min:2018-11-26 0...
4,T4:Back Bay:Lyft:Lux:distance_mean:2018-11-26 ...


In [22]:
import pandas as pd

def read_tsf_file(file_path, encoding='utf-8'):
    metadata = {}
    data_started = False
    data = []
    
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            line = line.strip()
            if not data_started:
                if line == '@data':
                    data_started = True
                elif line.startswith('@'):
                    key, value = line[1:].split(' ', 1)
                    metadata[key] = value
            else:
                if line:
                    data.append(line.split(','))  # Adjust the split as needed based on the actual delimiter

    # Create a DataFrame without predefined columns to avoid errors
    # Check if all rows have the same number of columns
    max_cols = max(len(row) for row in data)
    min_cols = min(len(row) for row in data)

    if max_cols != min_cols:
        print(f"Warning: Data rows vary in length from {min_cols} to {max_cols} columns.")
    
    columns = [f'column_{i+1}' for i in range(max_cols)]  # Generate column names dynamically
    df = pd.DataFrame(data, columns=columns)
    
    return df, metadata

In [23]:

# Example usage
tsf_path = 'C:\\Users\\young78703\\Documents\\GitHub\\Machine-Learning-Projects\\Time_Series_Data\\m4_hourly_dataset.tsf'
try:
    dataframe, tsf_metadata = read_tsf_file(tsf_path)
except UnicodeDecodeError:
    # If UTF-8 decoding fails, try another encoding
    dataframe, tsf_metadata = read_tsf_file(tsf_path, encoding='ISO-8859-1')

print(tsf_metadata)
print(dataframe.head())

{'relation': 'M4', 'attribute': 'start_timestamp date', 'frequency': 'hourly', 'horizon': '48', 'missing': 'false', 'equallength': 'false'}
                      column_1 column_2 column_3 column_4 column_5 column_6  \
0   T1:2015-07-01 12-00-00:605      586      586      559      511      443   
1  T2:2015-07-01 12-00-00:3124     2990     2862     2809     2544     2201   
2  T3:2015-07-01 12-00-00:1828     1806     1897     1750     1679     1620   
3  T4:2015-07-01 12-00-00:6454     6324     6075     5949     5858     5579   
4  T5:2015-07-01 12-00-00:4263     4297     4236     4080     3883     3672   

  column_7 column_8 column_9 column_10  ... column_999 column_1000  \
0      422      395      382       370  ...       None        None   
1     1996     1861     1735      1713  ...       None        None   
2     1463     1342     1192      1108  ...       None        None   
3     5163     4790     4478      4227  ...       None        None   
4     3248     2841     2513      2