In [1]:
from datetime import datetime
from distutils.util import strtobool

import pandas as pd


# Converts the contents in a .tsf file into a dataframe and returns it along with other meta-data of the dataset: frequency, horizon, whether the dataset contains missing values and whether the series have equal lengths
#
# Parameters
# full_file_path_and_name - complete .tsf file path
# replace_missing_vals_with - a term to indicate the missing values in series in the returning dataframe
# value_column_name - Any name that is preferred to have as the name of the column containing series values in the returning dataframe

def _parse_tsf_bool(text):
    """Translate a textual truth value from a .tsf header into a bool.

    Accepts the same spellings as ``distutils.util.strtobool``
    (case-insensitive): y/yes/t/true/on/1 and n/no/f/false/off/0.
    Implemented locally so the parser itself does not rely on distutils,
    which was removed from the standard library in Python 3.12.

    Raises:
        ValueError: if the text is not one of the recognised spellings.
    """
    lowered = text.lower()
    if lowered in ("y", "yes", "t", "true", "on", "1"):
        return True
    if lowered in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError("invalid truth value %r" % (lowered,))


def convert_tsf_to_dataframe(
    full_file_path_and_name,
    replace_missing_vals_with="NaN",
    value_column_name="series_value",
):
    """Load a .tsf file into a DataFrame and return it with the file's meta-data.

    The file is read with the cp1252 encoding. Lines starting with ``@`` are
    meta-data, lines starting with ``#`` are ignored, and every other
    non-empty line after ``@data`` is one series whose fields are separated
    by ``:`` (attribute values first, comma separated series values last).

    Parameters:
        full_file_path_and_name: complete .tsf file path.
        replace_missing_vals_with: value stored in place of every ``?``
            (missing) entry of a series.
        value_column_name: name of the DataFrame column that holds the
            series values.

    Returns:
        A tuple ``(loaded_data, frequency, forecast_horizon,
        contain_missing_values, contain_equal_length)``. ``loaded_data`` has
        one row per series, one column per ``@attribute`` and one column
        (``value_column_name``) holding the values of each series. The four
        meta-data entries are ``None`` when the corresponding tag
        (``@frequency``, ``@horizon``, ``@missing``, ``@equallength``) is
        absent from the file.

    Raises:
        Exception: if the file is empty, the attribute section or the
            ``@data`` tag is missing, a meta-data line is malformed, a row
            has the wrong number of fields, a series is empty or consists
            only of missing values, or an attribute type is not one of
            ``numeric``, ``string`` or ``date``.
        ValueError: if a series value, a numeric attribute, a date attribute
            or a boolean meta-data value cannot be parsed.
    """
    col_names = []
    col_types = []
    all_data = {}
    all_series = []
    line_count = 0
    frequency = None
    forecast_horizon = None
    contain_missing_values = None
    contain_equal_length = None
    found_data_tag = False
    found_data_section = False

    with open(full_file_path_and_name, "r", encoding="cp1252") as file:
        for line in file:
            # Strip white space from start/end of line
            line = line.strip()
            if not line:
                continue
            line_count += 1

            if line.startswith("@"):  # Read meta-data
                if line.startswith("@data"):
                    if not col_names:
                        raise Exception(
                            "Missing attribute section. Attribute section must come before data."
                        )
                    found_data_tag = True
                    continue

                line_content = line.split(" ")

                if line.startswith("@attribute"):
                    # Attributes have both name and type
                    if len(line_content) != 3:
                        raise Exception("Invalid meta-data specification.")
                    col_names.append(line_content[1])
                    col_types.append(line_content[2])
                    continue

                # Other meta-data have only values
                if len(line_content) != 2:
                    raise Exception("Invalid meta-data specification.")

                if line.startswith("@frequency"):
                    frequency = line_content[1]
                elif line.startswith("@horizon"):
                    forecast_horizon = int(line_content[1])
                elif line.startswith("@missing"):
                    contain_missing_values = _parse_tsf_bool(line_content[1])
                elif line.startswith("@equallength"):
                    contain_equal_length = _parse_tsf_bool(line_content[1])
                continue

            if line.startswith("#"):  # Comment line
                continue

            # Anything else is a data row.
            if not col_names:
                raise Exception(
                    "Missing attribute section. Attribute section must come before data."
                )
            if not found_data_tag:
                raise Exception("Missing @data tag.")

            if not found_data_section:
                # First data row: create one value list per declared attribute.
                found_data_section = True
                for col in col_names:
                    all_data[col] = []

            full_info = line.split(":")

            if len(full_info) != (len(col_names) + 1):
                raise Exception("Missing attributes/values in series.")

            series_text = full_info[-1]

            # str.split never returns an empty list, so emptiness has to be
            # checked on the raw field before splitting it.
            if not series_text:
                raise Exception(
                    "A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series. Missing values should be indicated with ? symbol"
                )

            numeric_series = []
            missing_count = 0

            for val in series_text.split(","):
                if val == "?":
                    numeric_series.append(replace_missing_vals_with)
                    missing_count += 1
                else:
                    numeric_series.append(float(val))

            # Count the "?" markers themselves instead of searching the parsed
            # list for the replacement value: a numeric replacement such as 0
            # would otherwise make a genuine all-zero series look "all missing".
            if missing_count == len(numeric_series):
                raise Exception(
                    "All series values are missing. A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series."
                )

            all_series.append(pd.Series(numeric_series).array)

            for col_name, col_type, raw_value in zip(col_names, col_types, full_info):
                if col_type == "numeric":
                    att_val = int(raw_value)
                elif col_type == "string":
                    att_val = str(raw_value)
                elif col_type == "date":
                    att_val = datetime.strptime(raw_value, "%Y-%m-%d %H-%M-%S")
                else:
                    # Currently, the code supports only numeric, string and
                    # date types. Extend this as required.
                    raise Exception("Invalid attribute type.")

                all_data[col_name].append(att_val)

    if line_count == 0:
        raise Exception("Empty file.")
    if not col_names:
        raise Exception("Missing attribute section.")
    if not found_data_section:
        raise Exception("Missing series information under data section.")

    all_data[value_column_name] = all_series
    loaded_data = pd.DataFrame(all_data)

    return (
        loaded_data,
        frequency,
        forecast_horizon,
        contain_missing_values,
        contain_equal_length,
    )




In [3]:
# Usage example: load the M4 hourly dataset from a local .tsf file, then show
# the resulting dataframe followed by the four meta-data values.
(
    loaded_data,
    frequency,
    forecast_horizon,
    contain_missing_values,
    contain_equal_length,
) = convert_tsf_to_dataframe(
    r"C:\Users\young78703\Documents\GitHub\Machine-Learning-Projects\Time_Series_Data\m4_hourly_dataset.tsf"
)

# One value per output line, in the order they were returned.
print(
    loaded_data,
    frequency,
    forecast_horizon,
    contain_missing_values,
    contain_equal_length,
    sep="\n",
)

    series_name     start_timestamp  \
0            T1 2015-07-01 12:00:00   
1            T2 2015-07-01 12:00:00   
2            T3 2015-07-01 12:00:00   
3            T4 2015-07-01 12:00:00   
4            T5 2015-07-01 12:00:00   
..          ...                 ...   
409        T410 2017-01-01 12:00:00   
410        T411 2017-01-01 12:00:00   
411        T412 2017-01-01 12:00:00   
412        T413 2017-06-12 12:00:00   
413        T414 2017-04-27 12:00:00   

                                          series_value  
0    [605.0, 586.0, 586.0, 559.0, 511.0, 443.0, 422...  
1    [3124.0, 2990.0, 2862.0, 2809.0, 2544.0, 2201....  
2    [1828.0, 1806.0, 1897.0, 1750.0, 1679.0, 1620....  
3    [6454.0, 6324.0, 6075.0, 5949.0, 5858.0, 5579....  
4    [4263.0, 4297.0, 4236.0, 4080.0, 3883.0, 3672....  
..                                                 ...  
409  [153.0, 196.0, 163.0, 131.0, 82.0, 53.0, 57.0,...  
410  [24.0, 30.0, 22.0, 14.0, 19.0, 20.0, 24.0, 45....  
411  [19.0, 30.0, 

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

def normalize_data(df):
    """Standardize `df` by subtracting its mean and dividing by its standard deviation.

    The statistics come from ``df.mean()`` and ``df.std()``, so for a
    DataFrame every column is scaled with its own mean and std.

    A column with zero standard deviation (all values identical) is divided
    by 1 instead of 0, so it comes out as all zeros rather than all NaN.

    Returns:
        An object of the same shape as `df` holding the standardized values.
    """
    mean = df.mean()
    std = df.std()

    # Dividing by a zero spread would yield 0/0 = NaN for the whole column.
    if isinstance(std, pd.Series):
        safe_std = std.replace(0, 1)
    else:
        safe_std = std if std != 0 else 1

    return (df - mean) / safe_std

def create_sequences(data, sequence_length):
    """Collect every contiguous window of `sequence_length` items from `data`.

    Windows start at positions 0, 1, ... and only full-length windows are
    produced; when `data` is shorter than `sequence_length` the result is an
    empty array.
    """
    window_count = len(data) - sequence_length + 1
    windows = []
    for start in range(window_count):
        stop = start + sequence_length
        windows.append(data[start:stop])
    return np.array(windows)

def prepare_symbol_data(df, sequence_length):
    """Build sliding-window features and labels for every symbol in `df`.

    For each distinct value of the ``Symbol`` column the rows are sorted by
    ``Date``, indexed by ``Date`` and normalized with `normalize_data` (the
    ``Symbol`` column is dropped first). Feature windows are taken from all
    remaining columns except ``Close``; the label of a window is the
    normalized ``Close`` value at the window's last row.

    Symbols with fewer rows than `sequence_length` cannot produce a single
    window; they are skipped with a warning instead of failing on the label
    indexing.

    Returns:
        ``(all_sequences, all_labels)``: the windows and labels of all
        symbols concatenated along the first axis.

    Raises:
        ValueError: if no symbol has enough rows to form a window.
    """
    import warnings

    all_sequences = []
    all_labels = []

    for symbol in df['Symbol'].unique():
        symbol_data = df[df['Symbol'] == symbol].sort_values('Date').set_index('Date')

        if len(symbol_data) < sequence_length:
            warnings.warn(
                "Skipping symbol %r: %d rows is fewer than sequence_length=%d."
                % (symbol, len(symbol_data), sequence_length)
            )
            continue

        normalized_data = normalize_data(symbol_data.drop(columns=['Symbol']))

        sequences = create_sequences(normalized_data.drop(columns=['Close']), sequence_length)
        labels = create_sequences(normalized_data['Close'], sequence_length)

        all_sequences.append(sequences)
        all_labels.append(labels[:, -1])  # Close at the last step of each window

    if not all_sequences:
        raise ValueError(
            "No symbol has at least sequence_length=%d rows; nothing to concatenate."
            % sequence_length
        )

    # Convert lists to numpy arrays
    all_sequences = np.concatenate(all_sequences)
    all_labels = np.concatenate(all_labels)

    return all_sequences, all_labels

def split_data(sequences, labels, train_frac=0.7, valid_frac=0.15, random_state=None):
    """Randomly split sequences and labels into train, validation and test sets.

    Parameters:
        sequences: array whose first axis indexes the samples.
        labels: array aligned with `sequences` along the first axis.
        train_frac: fraction of the samples assigned to the training set.
        valid_frac: fraction of the samples assigned to the validation set.
            Whatever remains after both (truncated) sizes goes to the test set.
        random_state: optional seed for a reproducible shuffle. When ``None``
            the shuffle draws from NumPy's global random state.

    Returns:
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
    """
    total_count = sequences.shape[0]
    train_end = int(total_count * train_frac)
    valid_end = train_end + int(total_count * valid_frac)

    if random_state is None:
        indices = np.random.permutation(total_count)
    else:
        indices = np.random.default_rng(random_state).permutation(total_count)

    train_indices = indices[:train_end]
    valid_indices = indices[train_end:valid_end]
    test_indices = indices[valid_end:]

    X_train, y_train = sequences[train_indices], labels[train_indices]
    X_valid, y_valid = sequences[valid_indices], labels[valid_indices]
    X_test, y_test = sequences[test_indices], labels[test_indices]

    return X_train, y_train, X_valid, y_valid, X_test, y_test

def convert_to_tensors(*arrays):
    """Turn each positional argument into a float32 PyTorch tensor.

    Returns a tuple with one tensor per argument, in the same order.
    """
    tensors = []
    for item in arrays:
        tensors.append(torch.tensor(item, dtype=torch.float32))
    return tuple(tensors)

def create_dataloaders(X_train, y_train, X_valid, y_valid, X_test, y_test, batch_size=32):
    """Wrap the three splits in DataLoaders and return them as (train, valid, test).

    Only the training loader shuffles; validation and test keep their order.
    All three use the same `batch_size`.
    """
    def _make_loader(features, targets, shuffle):
        dataset = TensorDataset(features, targets)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return (
        _make_loader(X_train, y_train, True),
        _make_loader(X_valid, y_valid, False),
        _make_loader(X_test, y_test, False),
    )

# Example usage: `df` is expected to already exist and hold the stock data.
sequence_length = 10
all_sequences, all_labels = prepare_symbol_data(df, sequence_length)

# Shuffle into train/validation/test splits and turn each split into a tensor.
(
    X_train,
    y_train,
    X_valid,
    y_valid,
    X_test,
    y_test,
) = convert_to_tensors(*split_data(all_sequences, all_labels))

# Wrap the tensors in DataLoaders.
train_loader, valid_loader, test_loader = create_dataloaders(
    X_train, y_train, X_valid, y_valid, X_test, y_test
)


In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

def normalize_data(df):
    """Standardize `df` and return the result together with the scale parameters.

    The statistics come from ``df.mean()`` and ``df.std()``, so for a
    DataFrame every column is scaled with its own mean and std.

    A column with zero standard deviation (all values identical) is divided
    by 1 instead of 0, so it comes out as all zeros rather than all NaN.

    Returns:
        ``(normalized, mean, std)`` where ``std`` is the divisor that was
        actually applied (zero entries replaced by 1), so that
        ``normalized * std + mean`` reproduces the input.
    """
    mean = df.mean()
    std = df.std()

    # Dividing by a zero spread would yield 0/0 = NaN for the whole column.
    if isinstance(std, pd.Series):
        std = std.replace(0, 1)
    elif std == 0:
        std = 1

    return (df - mean) / std, mean, std

def inverse_normalize_data(normalized_data, mean, std):
    """Undo standardization: multiply by `std`, then add `mean`."""
    rescaled = normalized_data * std
    return rescaled + mean

def create_sequences(data, sequence_length, padding_value=0):
    """Return every full-length sliding window of `data` as one array.

    Windows start at positions 0, 1, ... up to the last position where a
    complete window of `sequence_length` items still fits. When `data` is
    shorter than `sequence_length` no window is produced and the result is
    an empty array.

    `padding_value` is accepted for backward compatibility but has no
    effect: because only complete windows are generated, nothing is ever
    padded.
    """
    window_count = len(data) - sequence_length + 1
    return np.array([data[start:start + sequence_length] for start in range(window_count)])

def prepare_symbol_data(df, sequence_length):
    """Build sliding-window features, labels and scale parameters per symbol.

    For each distinct value of the ``Symbol`` column the rows are sorted by
    ``Date``, indexed by ``Date`` and normalized with `normalize_data` (the
    ``Symbol`` column is dropped first). Feature windows are taken from all
    remaining columns except ``Close``; the label of a window is the
    normalized ``Close`` value at the window's last row.

    Symbols with fewer rows than `sequence_length` cannot produce a single
    window; they are skipped with a warning instead of failing on the label
    indexing, and get no entry in the returned scale parameters.

    Returns:
        ``(all_sequences, all_labels, scale_params)``: the windows and labels
        of all symbols concatenated along the first axis, and a dict mapping
        each processed symbol to the ``(mean, std)`` pair returned by
        `normalize_data`.

    Raises:
        ValueError: if no symbol has enough rows to form a window.
    """
    import warnings

    all_sequences = []
    all_labels = []
    scale_params = {}

    for symbol in df['Symbol'].unique():
        symbol_data = df[df['Symbol'] == symbol].sort_values('Date').set_index('Date')

        if len(symbol_data) < sequence_length:
            warnings.warn(
                "Skipping symbol %r: %d rows is fewer than sequence_length=%d."
                % (symbol, len(symbol_data), sequence_length)
            )
            continue

        normalized_data, mean, std = normalize_data(symbol_data.drop(columns=['Symbol']))
        scale_params[symbol] = (mean, std)  # Store scaling parameters

        sequences = create_sequences(normalized_data.drop(columns=['Close']), sequence_length)
        labels = create_sequences(normalized_data['Close'], sequence_length)

        all_sequences.append(sequences)
        all_labels.append(labels[:, -1])  # Close at the last step of each window

    if not all_sequences:
        raise ValueError(
            "No symbol has at least sequence_length=%d rows; nothing to concatenate."
            % sequence_length
        )

    all_sequences = np.concatenate(all_sequences)
    all_labels = np.concatenate(all_labels)

    return all_sequences, all_labels, scale_params

def split_data(sequences, labels, train_frac=0.7, valid_frac=0.15, random_state=None):
    """Randomly split sequences and labels into train, validation and test sets.

    Parameters:
        sequences: array whose first axis indexes the samples.
        labels: array aligned with `sequences` along the first axis.
        train_frac: fraction of the samples assigned to the training set.
        valid_frac: fraction of the samples assigned to the validation set.
            Whatever remains after both (truncated) sizes goes to the test set.
        random_state: optional seed for a reproducible shuffle. When ``None``
            the shuffle draws from NumPy's global random state.

    Returns:
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
    """
    total_count = sequences.shape[0]
    train_size = int(total_count * train_frac)
    valid_size = int(total_count * valid_frac)

    if random_state is None:
        indices = np.random.permutation(total_count)
    else:
        indices = np.random.default_rng(random_state).permutation(total_count)

    # Three consecutive slices of the shuffled index: train, valid, remainder.
    train_indices = indices[:train_size]
    valid_indices = indices[train_size:train_size + valid_size]
    test_indices = indices[train_size + valid_size:]

    X_train, y_train = sequences[train_indices], labels[train_indices]
    X_valid, y_valid = sequences[valid_indices], labels[valid_indices]
    X_test, y_test = sequences[test_indices], labels[test_indices]

    return X_train, y_train, X_valid, y_valid, X_test, y_test

def convert_to_tensors(*arrays):
    """Build a float32 PyTorch tensor from every argument; return them as a tuple."""
    def _as_float_tensor(values):
        return torch.tensor(values, dtype=torch.float32)

    return tuple(map(_as_float_tensor, arrays))

def create_dataloaders(X_train, y_train, X_valid, y_valid, X_test, y_test, batch_size=32):
    """Return (train, valid, test) DataLoaders built from the given tensor pairs.

    The training loader shuffles its samples; the validation and test
    loaders iterate in order. Every loader uses `batch_size`.
    """
    train_set = TensorDataset(X_train, y_train)
    valid_set = TensorDataset(X_valid, y_valid)
    test_set = TensorDataset(X_test, y_test)

    loaders = (
        DataLoader(train_set, batch_size=batch_size, shuffle=True),
        DataLoader(valid_set, batch_size=batch_size, shuffle=False),
        DataLoader(test_set, batch_size=batch_size, shuffle=False),
    )
    return loaders

# Example usage assuming 'df' is your DataFrame containing the stock data
sequence_length = 10
# prepare_symbol_data in this cell returns three values; scale_params maps each
# symbol to the (mean, std) pair needed by inverse_normalize_data.
all_sequences, all_labels, scale_params = prepare_symbol_data(df, sequence_length)

# Split and convert to tensors
X_train, y_train, X_valid, y_valid, X_test, y_test = split_data(all_sequences, all_labels)
X_train, y_train, X_valid, y_valid, X_test, y_test = convert_to_tensors(X_train, y_train, X_valid, y_valid, X_test, y_test)

# Create DataLoader
train_loader, valid_loader, test_loader = create_dataloaders(X_train, y_train, X_valid, y_valid, X_test, y_test)
