In [None]:
import pandas as pd
from datetime import datetime
from distutils.util import strtobool

# Tokens accepted for the boolean meta-data flags (compared case-insensitively).
_TSF_TRUE_TOKENS = frozenset({"y", "yes", "t", "true", "on", "1"})
_TSF_FALSE_TOKENS = frozenset({"n", "no", "f", "false", "off", "0"})
# Timestamp layout expected for attributes declared with type "date".
_TSF_DATE_FORMAT = "%Y-%m-%d %H-%M-%S"


def _parse_tsf_bool(token):
    """Map a textual flag such as "true", "no" or "1" to a bool; raise ValueError otherwise."""
    lowered = token.lower()
    if lowered in _TSF_TRUE_TOKENS:
        return True
    if lowered in _TSF_FALSE_TOKENS:
        return False
    raise ValueError(f"invalid truth value {lowered!r}")


def _parse_tsf_attribute(raw_value, col_type):
    """Convert one attribute field of a data line according to its declared type."""
    if col_type == "numeric":
        return int(raw_value)
    if col_type == "string":
        return raw_value
    if col_type == "date":
        try:
            return datetime.strptime(raw_value, _TSF_DATE_FORMAT)
        except ValueError as e:
            # Re-raise with the offending value and expected layout in the message.
            raise ValueError(
                f"Error parsing date: {raw_value}. Expected format: '{_TSF_DATE_FORMAT}'. Error: {e}"
            ) from e
    raise Exception("Unsupported attribute type.")


def convert_tsf_to_dataframe(full_file_path_and_name, replace_missing_vals_with="NaN", value_column_name="series_value"):
    """Load a .tsf-style time-series file into a pandas DataFrame.

    The file is read line by line (cp1252 encoding). Blank lines and lines
    starting with "#" are ignored. Lines starting with "@" are meta-data:
    "@attribute <name> <type>" declares a column, "@frequency", "@horizon",
    "@missing" and "@equallength" set the corresponding return values, other
    two-token "@" lines are accepted and ignored, and "@data" marks the start
    of the series section. Every remaining line is a series made of
    colon-separated attribute values followed by a comma-separated list of
    observations, where "?" marks a missing observation.

    Boolean flags are parsed by a local helper instead of
    ``distutils.util.strtobool`` because ``distutils`` is no longer part of
    the standard library as of Python 3.12.

    Parameters
    ----------
    full_file_path_and_name : str
        Path of the file to read.
    replace_missing_vals_with : object, default "NaN"
        Value stored in place of every "?" observation.
    value_column_name : str, default "series_value"
        Name of the DataFrame column that holds each series' list of values.

    Returns
    -------
    tuple
        ``(loaded_data, frequency, forecast_horizon, contain_missing_values,
        contain_equal_length)``. ``loaded_data`` has one row per series, one
        column per declared attribute and one column with the list of values.
        The four meta-data items are ``None`` when their tag is absent.

    Raises
    ------
    Exception
        On malformed meta-data, a series before the attribute section or the
        "@data" tag, a wrong number of colon-separated fields, a series whose
        observations are all "?", an unsupported attribute type, or when no
        series was read at all.
    ValueError
        When an observation, a numeric attribute, a date attribute or a
        boolean flag cannot be parsed, or when a series has no observations.
    """
    col_names = []
    col_types = []
    all_data = {}
    frequency = None
    forecast_horizon = None
    contain_missing_values = None
    contain_equal_length = None
    found_data_tag = False

    with open(full_file_path_and_name, "r", encoding="cp1252") as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # blank line or comment

            if line.startswith("@"):
                if line.startswith("@data"):
                    if len(col_names) == 0:
                        raise Exception("Missing attribute section. Attribute section must come before data.")
                    found_data_tag = True
                    continue

                line_content = line.split(" ")
                if line.startswith("@attribute"):
                    # Attributes carry both a name and a type.
                    if len(line_content) != 3:
                        raise Exception("Invalid meta-data specification.")
                    col_names.append(line_content[1])
                    col_types.append(line_content[2])
                    continue

                # Every other meta-data tag carries exactly one value.
                if len(line_content) != 2:
                    raise Exception("Invalid meta-data specification.")
                if line.startswith("@frequency"):
                    frequency = line_content[1]
                elif line.startswith("@horizon"):
                    forecast_horizon = int(line_content[1])
                elif line.startswith("@missing"):
                    contain_missing_values = _parse_tsf_bool(line_content[1])
                elif line.startswith("@equallength"):
                    contain_equal_length = _parse_tsf_bool(line_content[1])
                continue

            # Anything else is a series line.
            if len(col_names) == 0 or not found_data_tag:
                raise Exception("Missing attribute section or @data tag.")

            if not all_data:
                # First series: create the attribute columns in declaration order.
                for col in col_names:
                    all_data[col] = []

            full_info = line.split(":")
            if len(full_info) != (len(col_names) + 1):
                raise Exception("Missing attributes/values in series.")

            # "".split(",") yields [""], so emptiness has to be tested on the raw field.
            if full_info[-1] == "":
                raise ValueError("A given series should contain at least one numeric value. Missing values should be indicated with ? symbol.")
            series = full_info[-1].split(",")

            # Count the "?" markers themselves: comparing parsed values against the
            # replacement would misclassify real observations equal to it (e.g. 0).
            if series.count("?") == len(series):
                raise Exception("All series values are missing.")

            series_data = [replace_missing_vals_with if val == "?" else float(val) for val in series]
            all_data.setdefault(value_column_name, []).append(series_data)

            for col_name, col_type, raw_value in zip(col_names, col_types, full_info[:-1]):
                all_data[col_name].append(_parse_tsf_attribute(raw_value, col_type))

    if not all_data:
        raise Exception("No data loaded. Check file content and format.")

    # Create DataFrame from dictionary
    loaded_data = pd.DataFrame(all_data)
    return loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length



In [None]:
import os
# Example of usage: load a .tsf file and save the resulting frame as CSV.
full_file_path_and_name = ''
loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe(full_file_path_and_name)

# Prefix of the output file; "_loaded_dat.csv" is appended to it below.
output_file_path = " "

# The CSV is written next to the prefix, not inside it, so only the folder
# that will actually contain the file has to exist.
loaded_data_csv_path = f'{output_file_path}_loaded_dat.csv'
loaded_data_dir = os.path.dirname(loaded_data_csv_path)
if loaded_data_dir:
    os.makedirs(loaded_data_dir, exist_ok=True)

# Saving the loaded DataFrame to a CSV file
loaded_data.to_csv(loaded_data_csv_path, index=False)

print(loaded_data)
print(frequency)
print(forecast_horizon)
print(contain_missing_values)
print(contain_equal_length)

In [None]:
import numpy as np
import pandas as pd

def create_input_matrix(dataset, lag):
    """Build a pooled lag-embedded training matrix from a collection of series.

    Each series in ``dataset['series_value']`` is divided by its mean (a mean
    of exactly 0 is replaced by 1 to avoid dividing by zero) and turned into
    rows of ``lag + 1`` consecutive values. In every row ``y`` is the most
    recent value, ``Lag1`` the value one step before it, and ``Lag{lag}`` the
    oldest one. The rows of all series are stacked into a single frame.

    For every embedded series the last ``lag`` normalized values are also
    returned, most recent first, using the same ``Lag1..Lag{lag}`` layout, so
    they line up with the lag columns of the training matrix.

    Series with fewer than ``lag + 1`` observations are skipped and contribute
    to none of the three outputs. Progress messages are printed per series.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``series_value`` column whose entries are sequences of
        numbers.
    lag : int
        Number of lagged values per row; must be at least 1.

    Returns
    -------
    embedded_series_df : pandas.DataFrame
        Columns ``y, Lag1, ..., Lag{lag}``; empty if no series was long enough.
    final_lags_df : pandas.DataFrame
        One row per embedded series with columns ``Lag1, ..., Lag{lag}``.
    series_means : list
        Normalization divisor of every embedded series, in the same order as
        the rows of ``final_lags_df``.

    Raises
    ------
    ValueError
        If ``lag`` is smaller than 1.
    """
    if lag < 1:
        raise ValueError("lag must be a positive integer.")

    lag_columns = [f"Lag{j}" for j in range(1, lag + 1)]
    embedded_series = []
    final_lags = []
    series_means = []
    total_series = len(dataset)

    for i, series in enumerate(dataset['series_value']):
        print(f"Processing series {i + 1}/{total_series}")
        time_series = np.array(series)
        n_obs = len(time_series)

        if n_obs < lag + 1:
            print(f"Skipping series {i + 1} due to insufficient length.")
            continue

        # Mean normalization
        mean_val = np.mean(time_series)
        if mean_val == 0:
            mean_val = 1  # Avoid division by zero
        time_series_normalized = time_series / mean_val

        # Column k holds the value k steps before the target: k = 0 is "y"
        # (values lag..n-1) and k = lag is the oldest lag (values 0..n-lag-1).
        # Every slice has n_obs - lag >= 1 elements, so no column is dropped.
        embedded = np.column_stack(
            [time_series_normalized[lag - k:n_obs - k] for k in range(lag + 1)]
        )

        # Append all three per-series outputs together so they stay aligned.
        embedded_series.append(embedded)
        final_lags.append(time_series_normalized[n_obs - lag:][::-1])  # most recent value first
        series_means.append(mean_val)

    if embedded_series:
        embedded_series_df = pd.DataFrame(np.vstack(embedded_series), columns=["y"] + lag_columns)
    else:
        embedded_series_df = pd.DataFrame(columns=["y"] + lag_columns)

    if final_lags:
        final_lags_df = pd.DataFrame(final_lags, columns=lag_columns)
    else:
        final_lags_df = pd.DataFrame(columns=lag_columns)

    return embedded_series_df, final_lags_df, series_means

In [None]:
import os

# Build the pooled lag matrix and the per-series final lags from the loaded series.
embedded_series_df, final_lags_df, series_means = create_input_matrix(loaded_data, lag=10)

# Prefix of the output files; a suffix and ".csv" are appended to it below.
output_file_path = "/home/predict_price/stock_price/data_save/Direction/ver4/ver4_Volatility_Signal_seq_5_split_0.7_US_list_2001"
# output_file_path = "/home/predict_price/stock_price/data_save/daily/ver5.5/test/test"

# The CSVs are written next to the prefix, not inside it, so only the folder
# that will actually contain them has to exist.
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Saving the embedded series DataFrame to a CSV file
embedded_series_df.to_csv(f'{output_file_path}_embedded_series_df.csv', index=False)
# Saving the final lags DataFrame to a CSV file
final_lags_df.to_csv(f'{output_file_path}_final_lags_df.csv', index=False)

In [None]:
# def prepare_symbol_data(df, num_lags):
#     all_features = []
#     all_labels = []
#     scale_params = {}

#     for symbol in df['Symbol'].unique():
#         symbol_data = df[df['Symbol'] == symbol]
#         symbol_data = symbol_data.sort_values('Date')
#         symbol_data.set_index('Date', inplace=True)
        
#         normalized_data, mean, std = normalize_data(symbol_data.drop(columns=['Symbol']))
#         scale_params[symbol] = (mean, std)  # Store scaling parameters

#         lagged_data = create_lagged_rows(normalized_data, num_lags)
        
#         features = lagged_data.iloc[:, :-1]  # All but the last column as features
#         labels = lagged_data.iloc[:, -1]  # Last column as labels
        
#         all_features.append(features)
#         all_labels.append(labels)

#     all_features = pd.concat(all_features)
#     all_labels = pd.concat(all_labels)

#     return all_features, all_labels, scale_params

In [None]:
# import pandas as pd
# import numpy as np

# # Generate example data
# np.random.seed(0)
# dates = pd.date_range('20200101', periods=1000)  # 1000 daily timestamps; only the first 500 are used, once per symbol
# data = pd.DataFrame({
#     'Symbol': ['A']*500 + ['B']*500,
#     'Date': dates[:500].tolist() + dates[:500].tolist(),  # Repeating the same date range for simplicity
#     'Open': np.random.rand(1000) * 100,
#     'High': np.random.rand(1000) * 100,
#     'Low': np.random.rand(1000) * 100,
#     'Close': np.random.rand(1000) * 100,
#     'Volume': np.random.randint(1000, 10000, size=1000)
# })

# # Function to create lagged features for multiple columns
# def create_lagged_features(data, num_lags, columns):
#     df = data.copy()
#     for col in columns:
#         for lag in range(1, num_lags + 1):
#             df[f'{col}_lag_{lag}'] = df[col].shift(lag)
#     return df.dropna()  # Dropping rows to remove NA values from lagging

# # Columns to lag
# columns_to_lag = ['Open', 'High', 'Low', 'Close', 'Volume']

# # Prepare data by symbol
# grouped = data.groupby('Symbol')
# prepared_data = [create_lagged_features(group, num_lags=20, columns=columns_to_lag) for _, group in grouped]

# # Combine all symbols back into a single DataFrame
# final_data = pd.concat(prepared_data)

# # Print the head of the final dataset to see some of the transformed data
# print(final_data)
