In [None]:
import pandas as pd
# Absolute location of the raw steel-industry CSV on the author's machine.
file_path = '/Users/moji/PyTSF-MfG/data/Steel_industry2.csv'
# Read the file with pandas' default parsing options; every later steel cell
# starts from this frame.
df = pd.read_csv(filepath_or_buffer=file_path)
# Leave the frame as the cell's last expression so the notebook renders it.
df

In [None]:
# Display basic information about the dataset (column names, non-null counts, dtypes).
# DataFrame.info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would append a stray "None" line.
print("Dataset info:")
df.info()

In [None]:
# Preview the first rows of the raw frame.
print("\nDataset head:")
print(df.head())

# Flag every row whose 'Date_Time' value occurs more than once; keep=False marks
# all occurrences of a repeated timestamp, not only the later ones.
is_repeated_timestamp = df.duplicated(subset=['Date_Time'], keep=False)
duplicates = df.loc[is_repeated_timestamp]
print(f"\nNumber of rows with duplicate timestamps: {len(duplicates)}")


In [None]:
# Build df_deduped, the frame with one row per 'Date_Time' value that the later
# cells consume. It is assigned on BOTH branches so the notebook still runs
# top-to-bottom when the dataset has no duplicate timestamps.
if len(duplicates) > 0:
    print("\nSample of duplicate timestamps:")
    print(duplicates.head(10))  # Show first 10 duplicates

    # Option 1 (used): keep the first occurrence of each timestamp.
    df_deduped = df.drop_duplicates(subset=['Date_Time'], keep='first')

    # Option 2 (not used): aggregate rows that share a timestamp, e.g. by
    # averaging the numeric columns.
    # df_deduped = df.groupby('Date_Time', as_index=False, sort=False).mean(numeric_only=True)

    print(f"\nShape of dataset after deduplication: {df_deduped.shape}")

    # Check if deduplication resolved the issue
    if df_deduped['Date_Time'].is_unique:
        print("Deduplication successful. Timestamps are now unique.")
    else:
        print("Deduplication did not resolve all duplicates. Further investigation needed.")

    # # Save the deduplicated dataset
    # output_path = '/Users/moji/PyTSF-MfG/data/deduplicated_dataset.csv'  # Replace with your desired output path
    # df_deduped.to_csv(output_path, index_label='id')
    # print(f"\nDeduplicated dataset saved as: {output_path}")

else:
    print("No duplicate timestamps found. The issue may be elsewhere.")
    # Nothing to drop: pass the data through unchanged so that later cells,
    # which read df_deduped, do not fail with a NameError.
    df_deduped = df.copy()

In [None]:
import pandas as pd
# Make an explicit copy so the conversion below never writes into a view of df.
df_deduped = df_deduped.copy()

# Parse 'Date_Time' before using it. Plain column assignment replaces the column,
# so it ends up with a datetime64 dtype; on pandas 2.x a `.loc[:, col] = ...`
# assignment writes into the existing object-dtype column instead.
df_deduped['Date_Time'] = pd.to_datetime(df_deduped['Date_Time'], format='%d-%m-%Y %H:%M')

# Report the range from the parsed timestamps. min()/max() on the raw
# 'dd-mm-YYYY HH:MM' strings compare lexicographically (day-of-month first) and
# therefore do not give the chronological start and end.
print("\nDate range:")
print(f"Start: {df_deduped['Date_Time'].min()}")
print(f"End: {df_deduped['Date_Time'].max()}")

# Check for any inconsistencies in the time intervals between consecutive rows
# (in the frame's current row order).
time_diff = df_deduped['Date_Time'].diff()
# The first row has no predecessor, so its diff is NaT. NaT compares unequal to
# every Timedelta and would otherwise always be reported as an inconsistency.
has_predecessor = time_diff.notna()
inconsistent_intervals = time_diff[has_predecessor & (time_diff != pd.Timedelta(minutes=15))]
if not inconsistent_intervals.empty:
    print("\nInconsistent time intervals found:")
    print(inconsistent_intervals.head())
else:
    print("\nAll time intervals are consistent (15 minutes).")

In [None]:
# Build a gap-free 15-minute version of the steel data and save it as CSV.
# Start from a copy of the deduplicated frame so df_deduped itself is not modified.
df = df_deduped.copy()
# Parse 'Date_Time' (day-month-year, 24h clock), use it as the index, and sort
# chronologically so the reindex below lines up with a monotonic range.
df['Date_Time'] = pd.to_datetime(df['Date_Time'], format='%d-%m-%Y %H:%M')
df = df.set_index('Date_Time').sort_index()
# Every 15-minute timestamp between the first and last observation.
date_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='15min')
# Reindex onto the complete range; timestamps absent from the data become all-NaN rows.
df_reindexed = df.reindex(date_range)
# Check for missing values after reindexing
print("Missing values after reindexing (including weekends):")
print(df_reindexed.isnull().sum())
# Interpolate missing values, weighting by the actual time distance between points.
df_interpolated = df_reindexed.interpolate(method='time')
# Check for missing values after interpolation
print("\nMissing values after interpolation:")
print(df_interpolated.isnull().sum())
# Fill whatever is still NaN from the next valid row, then from the previous one.
# .bfill()/.ffill() replace fillna(method=...), which is deprecated since pandas 2.1.
df_interpolated = df_interpolated.bfill().ffill()

# Save the new dataset without missing values
output_path = '/Users/moji/PyTSF-MfG/data/steel_Interpolated.csv'
df_interpolated.to_csv(output_path)
print(f"\nNew dataset saved as '{output_path}'")

# Display the first few rows of the interpolated dataset
print("\nFirst few rows of the interpolated dataset:")
print(df_interpolated.head())

# Display basic statistics of the interpolated dataset
print("\nBasic statistics of the interpolated dataset:")
print(df_interpolated.describe())

In [None]:
# Directory that the other cells in this notebook read their CSV files from
# (absolute path on the author's machine).
data_path = '/Users/moji/PyTSF-MfG/data'
# NOTE(review): load_datasets_statforecast_uni is neither defined nor imported in
# the cells visible here — confirm it is in scope on a fresh kernel before this
# cell runs. Presumably it loads the datasets found under data_path; verify
# against its definition.
datasets = load_datasets_statforecast_uni(data_path)

In [None]:
import pandas as pd
import numpy as np

# Build a gap-free daily version of the Brent oil price data and save it as CSV.
# Load the dataset
df = pd.read_csv('/Users/moji/PyTSF-MfG/data/BrentOilPrices.csv')
# Parse 'Date'; format='mixed' lets pandas infer the format of each value
# individually, which is needed when the column mixes date styles.
df['Date'] = pd.to_datetime(df['Date'], format='mixed')
# Use 'Date' as the index and sort chronologically.
df = df.set_index('Date').sort_index()
# Create a complete daily date range including weekends
date_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='D')
# Reindex onto the complete range; days absent from the data become all-NaN rows.
df_reindexed = df.reindex(date_range)
# Check for missing values after reindexing
print("Missing values after reindexing (including weekends):")
print(df_reindexed.isnull().sum())
# Interpolate missing values, weighting by the actual time distance between points.
df_interpolated = df_reindexed.interpolate(method='time')
# Check for missing values after interpolation
print("\nMissing values after interpolation:")
print(df_interpolated.isnull().sum())
# Fill whatever is still NaN from the next valid row, then from the previous one.
# .bfill()/.ffill() replace fillna(method=...), which is deprecated since pandas 2.1.
df_interpolated = df_interpolated.bfill().ffill()

# Save the new dataset without missing values
output_path = '/Users/moji/PyTSF-MfG/data/BrentOilPrices_Interpolated.csv'
df_interpolated.to_csv(output_path)
print(f"\nNew dataset saved as '{output_path}'")

# Display the first few rows of the interpolated dataset
print("\nFirst few rows of the interpolated dataset:")
print(df_interpolated.head())

# Display basic statistics of the interpolated dataset
print("\nBasic statistics of the interpolated dataset:")
print(df_interpolated.describe())

In [None]:
import pandas as pd
import numpy as np

# Build a gap-free 15-minute version of the ISO-NY Central load data and save it as CSV.
# Load the dataset
df = pd.read_csv('/Users/moji/PyTSF-MfG/data/ISO-NY_Central.csv')
print(df.shape)
# Convert 'Time_Stamp' column to datetime
df['Time_Stamp'] = pd.to_datetime(df['Time_Stamp'])

# Use 'Time_Stamp' as the index and sort chronologically.
df = df.set_index('Time_Stamp').sort_index()

# Create a complete time range with 15-minute intervals
time_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='15min')

# Reindex onto the complete range; timestamps absent from the data become all-NaN rows.
df_reindexed = df.reindex(time_range)
print(df_reindexed.shape)
# Check for missing values after reindexing
print("Missing values after reindexing:")
print(df_reindexed.isnull().sum())
# Interpolate missing values, weighting by the actual time distance between points.
df_interpolated = df_reindexed.interpolate(method='time')
# Check for missing values after interpolation
print("\nMissing values after interpolation:")
print(df_interpolated.isnull().sum())
# Fill whatever is still NaN from the next valid row, then from the previous one.
# .bfill()/.ffill() replace fillna(method=...), which is deprecated since pandas 2.1.
df_interpolated = df_interpolated.bfill().ffill()
# Final check for missing values
print("\nFinal check for missing values:")
print(df_interpolated.isnull().sum())

# Save the new dataset without missing values
output_path = '/Users/moji/PyTSF-MfG/data/LoadData_Interpolated.csv'
df_interpolated.to_csv(output_path)

print(f"\nNew dataset saved as '{output_path}'")

# Display the first few rows of the interpolated dataset
print("\nFirst few rows of the interpolated dataset:")
print(df_interpolated.head())

# Display basic statistics of the interpolated dataset
print("\nBasic statistics of the interpolated dataset:")
print(df_interpolated.describe())
# Check for any gaps in the time series. The check runs on the sorted source
# timestamps (df.index): df_interpolated is indexed by time_range, which is
# evenly spaced by construction, so testing it could never report a gap.
time_diff = df.index.to_series().diff()
gaps = time_diff[time_diff > pd.Timedelta(minutes=15)]
if not gaps.empty:
    print("\nGaps found in the time series:")
    print(gaps)
else:
    print("\nNo gaps found in the time series.")