In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): despite its name, `file_path` is bound to the DataFrame that
# pd.read_csv returns, not to a path string, and the CSV is read as soon as
# this line executes. The functions below only use their own `file_path`
# parameter/local, so this module-level value looks unused — TODO confirm it
# can be removed.
file_path=pd.read_csv('C:/Users/HDC0422272/Downloads/PRICE_AND_DEMAND_201801_NSW1.csv')
def load_data(file_path):
    """Read a CSV source into a DataFrame.

    Args:
        file_path: CSV location, handed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_path)

def clean_data(data, timestamp_column):
    """Clean the time series data without modifying the input frame.

    Steps, in order:
      1. Parse ``timestamp_column`` as datetimes; unparseable values become NaT.
      2. Drop rows whose timestamp is NaT.
      3. Drop fully duplicated rows.
      4. Fill remaining missing values forward, then backward so that leading
         gaps are filled as well.

    Args:
        data: DataFrame containing ``timestamp_column``.
        timestamp_column: Name of the column holding the timestamps.

    Returns:
        A new, cleaned DataFrame. ``data`` itself is left untouched.
    """
    # Work on a copy: assigning the parsed column directly would overwrite the
    # caller's original timestamp values.
    cleaned = data.copy()
    cleaned[timestamp_column] = pd.to_datetime(
        cleaned[timestamp_column], errors='coerce'
    )

    cleaned = cleaned.dropna(subset=[timestamp_column])
    cleaned = cleaned.drop_duplicates()

    # ffill()/bfill() replace fillna(method=...), which is deprecated in
    # recent pandas releases.
    return cleaned.ffill().bfill()

# Step 3: Handle Time Series Data
def handle_time_series(data, timestamp_column):
    """Return the rows ordered by timestamp, with the timestamp as the index.

    Args:
        data: DataFrame containing ``timestamp_column``.
        timestamp_column: Name of the column to sort by and promote to index.

    Returns:
        A new DataFrame sorted by ``timestamp_column`` and indexed by it.
    """
    ordered = data.sort_values(by=timestamp_column)
    return ordered.set_index(timestamp_column)

# Step 4: Preprocessing Techniques
def preprocess_data(data, columns_to_scale):
    """Min-max scale the selected columns without modifying the input frame.

    Args:
        data: DataFrame containing the columns to scale.
        columns_to_scale: Column labels to pass through ``MinMaxScaler``.

    Returns:
        A ``(scaled_data, scaler)`` tuple: a new DataFrame whose selected
        columns hold the scaled values, and the fitted ``MinMaxScaler`` so the
        same transformation can be reused or inverted later.
    """
    # Scale a copy so the caller keeps the original, unscaled values.
    scaled = data.copy()
    scaler = MinMaxScaler()
    scaled[columns_to_scale] = scaler.fit_transform(scaled[columns_to_scale])
    return scaled, scaler

def resample_data(data, frequency, aggregation_method='mean'):
    """Aggregate the time series into buckets of the given frequency.

    Args:
        data: DataFrame to resample by its index.
        frequency: Offset alias passed to ``DataFrame.resample``.
        aggregation_method: Aggregation applied to each bucket
            (default ``'mean'``).

    Returns:
        The resampled DataFrame.
    """
    buckets = data.resample(frequency)
    return buckets.apply(aggregation_method)

def create_lag_features(data, column, lag_count):
    """Add lagged copies of a column without modifying the input frame.

    For each ``lag`` from 1 to ``lag_count`` a column named
    ``{column}_lag_{lag}`` is appended, holding ``column`` shifted down by
    ``lag`` rows (so the first ``lag`` rows of that column are NaN).

    Args:
        data: DataFrame containing ``column``.
        column: Name of the column to lag.
        lag_count: Number of lag columns to create; 0 or less adds none.

    Returns:
        A new DataFrame with the lag columns appended. ``data`` itself is
        left untouched.
    """
    source = data[column]
    lag_columns = {
        f'{column}_lag_{lag}': source.shift(lag)
        for lag in range(1, lag_count + 1)
    }
    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain columns.
    return data.assign(**lag_columns)

# Main Function
def main():
    """Run the full preprocessing pipeline on the price-and-demand CSV.

    Loads the CSV, cleans it, indexes it by timestamp, min-max scales the
    numeric columns, resamples them to hourly means, adds three lag features
    for the first numeric column, prints the first rows, and writes the
    result to ``processed_time_series.csv``.

    Raises:
        ValueError: If the loaded data contains no numeric columns.
    """
    # File path for the uploaded CSV file
    file_path = 'C:/Users/HDC0422272/Downloads/PRICE_AND_DEMAND_201801_NSW1.csv'
    # The output name matches the message printed at the end of the run.
    output_path = 'C:/Users/HDC0422272/Downloads/processed_time_series.csv'
    timestamp_column = 'SETTLEMENTDATE'  # Adjust this column name if necessary

    # Load the data
    data = load_data(file_path)
    print("Data Loaded Successfully")

    # Clean the data
    data = clean_data(data, timestamp_column)
    print("Data Cleaning Complete")

    # Handle time series data
    data = handle_time_series(data, timestamp_column)
    print("Time Series Handling Complete")

    # Preprocessing: 'number' selects every numeric dtype, not only the
    # 64-bit ones.
    numeric_columns = data.select_dtypes(include='number').columns
    if len(numeric_columns) == 0:
        raise ValueError("No numeric columns found to scale or lag.")
    data, scaler = preprocess_data(data, numeric_columns)
    print("Data Preprocessing Complete")

    # Resample only the numeric columns: a mean over text columns raises in
    # recent pandas instead of silently dropping them. '1h' replaces the
    # deprecated '1H' alias.
    data = resample_data(data[numeric_columns], frequency='1h')
    print("Data Resampling Complete")

    # Create lag features
    target_column = numeric_columns[0]  # Use the first numeric column as an example
    data = create_lag_features(data, target_column, lag_count=3)
    print("Lag Features Creation Complete")

    # Display final data
    print(data.head())

    # Save to a new CSV
    data.to_csv(output_path)
    print("Processed data saved as 'processed_time_series.csv'")


if __name__ == "__main__":
    main()


Data Loaded Successfully
Data Cleaning Complete
Time Series Handling Complete
Data Preprocessing Complete
Data Resampling Complete
Lag Features Creation Complete
                     TOTALDEMAND       RRP  TOTALDEMAND_lag_1  \
SETTLEMENTDATE                                                  
2018-01-01 00:00:00     0.152204  0.264025                NaN   
2018-01-01 01:00:00     0.105670  0.209290           0.152204   
2018-01-01 02:00:00     0.048370  0.153474           0.105670   
2018-01-01 03:00:00     0.019955  0.146655           0.048370   
2018-01-01 04:00:00     0.007755  0.137802           0.019955   

                     TOTALDEMAND_lag_2  TOTALDEMAND_lag_3  
SETTLEMENTDATE                                             
2018-01-01 00:00:00                NaN                NaN  
2018-01-01 01:00:00                NaN                NaN  
2018-01-01 02:00:00           0.152204                NaN  
2018-01-01 03:00:00           0.105670           0.152204  
2018-01-01 04:00:00   

  resampled_data = data.resample(frequency).apply(aggregation_method)
