In [7]:
import pandas as pd
from tqdm import tqdm
import numpy as np
def multiple_ts_file_to_dfs(series_csv: str = "../../RDN/Load_Data/2009-2019-global-load.csv",
                            resolution: str = "15min",
                            value_name: str = "Value",
                            format: str = "long") -> tuple:
    """
    Reads the input multiple ts file, and returns a tuple containing a list of the time series it consists
    of, along with their ids and timeseries ids.

    The first column of the csv is consumed as the dataframe index. The file must also
    contain the columns "Timeseries ID" and "ID", plus "Datetime" (long format) or
    "Date" (short format).

    Parameters
    ----------
    series_csv
        The file name of the csv to be read. It must be in the multiple ts form described in the documentation
    resolution
        The resolution of the dataset. Every component is re-indexed to this frequency,
        so timestamps missing from the file show up as NaN rows in the result
    value_name
        The name of the value column of the returned dataframes. In long format this
        column must already exist in the file
    format
        Either "long" (one row per timestamp, with a "Datetime" column and a value column)
        or "short" (one row per date, with a "Date" column and every remaining column
        named after a time of day in HH:MM:SS form)

    Returns
    -------
    Tuple[List[List[pandas.DataFrame]], List[List[str]], List[List[str]]]
        A tuple with the list of lists of dataframes to be returned, the ids
        of their components, and the timeseries ids. For example, if the function
        reads a file with 2 time series (with ids ts_1 and ts_2), and each one
        consists of 3 components (with ids ts_1_1, ts_1_2, ts_1_3, ts_2_1, ts_2_2, ts_2_3),
        then the function will return:
        (res, id_l, ts_id_l), where:
        res = [[ts_1_comp_1, ts_1_comp_2, ts_1_comp_3], [ts_2_comp_1, ts_2_comp_2, ts_2_comp_3]]
        id_l = [[ts_1_1, ts_1_2, ts_1_3], [ts_2_1, ts_2_2, ts_2_3]]
        ts_id_l = [[ts_1, ts_1, ts_1], [ts_2, ts_2, ts_2]]
        All of the above lists of lists have the same number of lists and each sublist the same
        amount of elements as the sublist of any other list of lists in the corresponding location.
        This is true because each sublist corresponds to a time series, and each element of this
        sublist corresponds to a component of this time series.

    Raises
    ------
    ValueError
        If ``format`` is neither "long" nor "short", or if a component has fewer than
        two non-NaN values.
    """
    # Fail early: previously an unknown format was treated as "short" while parsing
    # and as "long" inside the loop, which surfaced as an unrelated KeyError.
    if format not in ("long", "short"):
        raise ValueError(f'format must be "long" or "short", got {format!r}')

    ts = pd.read_csv(series_csv,
                     sep=None,
                     header=0,
                     index_col=0,
                     engine='python')

    # Parse the date column once on the full frame; the per-component slices taken
    # below inherit the datetime dtype, so no re-parsing is needed on them.
    date_col = "Datetime" if format == "long" else "Date"
    ts[date_col] = pd.to_datetime(ts[date_col])

    res = []
    id_l = []
    ts_id_l = []
    ts_ids = list(np.unique(ts["Timeseries ID"]))
    print("\nTurning multiple ts file to dataframe list...")
    for ts_id in tqdm(ts_ids):
        curr_ts = ts[ts["Timeseries ID"] == ts_id]
        comp_frames = []
        comp_ids = []
        for comp_id in np.unique(curr_ts["ID"]):
            curr_comp = curr_ts[curr_ts["ID"] == comp_id]
            if format == 'short':
                # Wide -> long: every non-id column is a time of day, which is glued
                # onto the row's date to rebuild the full timestamp.
                curr_comp = pd.melt(curr_comp, id_vars=['Date', 'ID', 'Timeseries ID'], var_name='Time', value_name=value_name)
                curr_comp["Datetime"] = pd.to_datetime(curr_comp['Date'].dt.strftime("%Y-%m-%d") + curr_comp['Time'], format='%Y-%m-%d%H:%M:%S')
            curr_comp = curr_comp.set_index("Datetime")
            series = curr_comp[value_name].sort_index().dropna()

            # A component needs at least two observations to be usable
            if len(series) <= 1:
                raise ValueError(
                    f"Component {comp_id!r} of time series {ts_id!r} has "
                    f"{len(series)} non-NaN value(s); at least 2 are required")

            series = series.asfreq(resolution)

            comp_frames.append(pd.DataFrame({value_name: series}))
            comp_ids.append(comp_id)
        res.append(comp_frames)
        id_l.append(comp_ids)
        # One timeseries id per component, so the three outputs stay aligned
        ts_id_l.append([ts_id] * len(comp_ids))
    return res, id_l, ts_id_l


def analyze_time_series(ts, ts_id, id):
    """
    Build a short text summary of one time series component.

    Parameters
    ----------
    ts
        Object exposing an ``index`` attribute (e.g. a pandas DataFrame); only the
        first and last index entries are read
    ts_id
        Identifier of the time series the component belongs to
    id
        Identifier of the component itself

    Returns
    -------
    str
        Three newline-separated lines: the identifiers, the first index value
        ("Start") and the last index value ("End").
    """
    header = f"Time Series ID: {ts_id}, Series ID: {id}"

    # Wrap once so positional access works regardless of the index flavour
    index = pd.Index(ts.index)
    start_line = f"Start: {index[0]}"
    end_line = f"End: {index[-1]}"

    return "\n".join((header, start_line, end_line))

def make_info(file_name, output_file_name, resolution="1h"):
    """
    Write a start/end summary for every component of a multiple ts csv.

    Parameters
    ----------
    file_name
        Path of the long-format multiple ts csv to read
    output_file_name
        Path of the text file to create (overwritten if it exists)
    resolution
        Frequency the components are re-indexed to while loading; defaults to "1h"
    """
    ts_list, id_l, ts_id_l = multiple_ts_file_to_dfs(
            series_csv=file_name,
            resolution=resolution,
            value_name="Value",
            format="long")

    # The context manager closes the file even if summarising a component fails
    with open(output_file_name, "w", encoding="utf-8") as f:
        for ts_, id_, ts_id_ in zip(ts_list, id_l, ts_id_l):
            for ts, comp_id, ts_id in zip(ts_, id_, ts_id_):
                ts_info = analyze_time_series(ts, ts_id, comp_id)
                # Blank line between components keeps the report readable
                f.write(ts_info + "\n\n")

# Generate the start/end report for the 6H_long_2022_2023 csv.
make_info(
    file_name="../TIMESCALE_TO_CSV/6H_long_2022_2023.csv",
    output_file_name="../TIMESCALE_TO_CSV/info_series.txt",
)


Turning multiple ts file to dataframe list...


100%|██████████| 49/49 [00:00<00:00, 104.25it/s]
