In [1]:
import os
import json
import glob
import pandas as pd
import streamlit as st
from paths import *

def convert_json_to_csv(input_folder, output_folder):
    messages = []
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        messages.append(f"Created output folder: {output_folder}")
    
    json_files = glob.glob(os.path.join(input_folder, "*.json"))
    
    if not json_files:
        messages.append("No JSON files found in the folder.")
        return messages
    
    for json_file in json_files:
        try:
            with open(json_file, "r") as f:
                data = json.load(f)
            records = data.get("response", {}).get("data", [])
            df = pd.json_normalize(records)
            
            base_name = os.path.basename(json_file)
            file_name = os.path.splitext(base_name)[0]
            output_file = os.path.join(output_folder, file_name + ".csv")
            
            df.to_csv(output_file, index=False)
            messages.append(f"Converted {json_file} to {output_file}")
        except Exception as e:
            messages.append(f"Error processing {json_file}: {e}")
    
    return messages

def merge_csv_files(folder1, folder2, output_file):
    messages = []
    csv_files1 = glob.glob(os.path.join(folder1, "*.csv"))
    csv_files2 = glob.glob(os.path.join(folder2, "*.csv"))
    
    if not csv_files1 or not csv_files2:
        messages.append("No CSV files found in one or both folders.")
        return messages
    
    dfs1 = [pd.read_csv(file) for file in csv_files1]
    dfs2 = [pd.read_csv(file) for file in csv_files2]
    
    if not dfs1 or not dfs2:
        messages.append("No data to merge from one or both folders.")
        return messages
    
    df1 = pd.concat(dfs1, ignore_index=True, sort=False)
    df2 = pd.concat(dfs2, ignore_index=True, sort=False)
    merged_df = pd.concat([df1, df2], axis=1)

    # Ensure output directory exists
    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        messages.append(f"Created output directory: {output_dir}")
    
    try:
        merged_df.to_csv(output_file, index=False)
        messages.append(f"Merged CSV saved as {output_file}")
    except Exception as e:
        messages.append(f"Error saving merged CSV: {e}")
    
    return messages
def merger():
    st.write("Converting JSON to CSV...")
    convert_json_to_csv(json_input_folder, csv_output_folder)

    st.write("Process completed!")

    st.write("Merging CSV files...")
    merge_csv_files(csv_folder1, csv_folder2, merged_csv_output)

    st.write("Process completed!")


In [3]:
import os
import pandas as pd
import numpy as np
import streamlit as st
from paths import *


def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Fall"

def clean_data(df, log):
    df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    log.append("Missing data percentage per column:")
    missing_pct = df.isna().mean() * 100
    log.append(missing_pct.to_string())

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    for col in numeric_cols:
        if col != "temperature_2m" and df[col].isna().sum() > 0:
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
            log.append(f"Imputed missing values in numeric column '{col}' with median: {median_val}")

    for col in categorical_cols:
        if df[col].isna().sum() > 0:
            mode_val = df[col].mode()[0]
            df[col] = df[col].fillna(mode_val)
            log.append(f"Imputed missing values in categorical column '{col}' with mode: {mode_val}")

    for col in df.columns:
        if col.lower() in ['period', 'date'] or ("date" in col.lower() or "time" in col.lower()):
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
                if df[col].isna().sum() > 0 and df[col].notna().sum() > 0:
                    mode_date = df[col].mode()[0]
                    df[col] = df[col].fillna(mode_date)
                    log.append(f"Imputed missing values in date column '{col}' with mode: {mode_date}")
                df[col] = pd.to_datetime(df[col], errors='coerce')
                log.append(f"Converted column '{col}' to datetime.")

                prefix = "extracted_" + col
                if df[col].notna().sum() > 0:
                    df[prefix + "_hour"] = df[col].dt.hour
                    df[prefix + "_day"] = df[col].dt.day
                    df[prefix + "_month"] = df[col].dt.month
                    df[prefix + "_dayofweek"] = df[col].dt.dayofweek
                    for feat, rng in [("_hour", (0,23)), ("_day", (1,28)), ("_month", (1,12)), ("_dayofweek", (0,6))]:
                        feat_col = prefix + feat
                        if df[feat_col].isna().sum() > 0 and df[feat_col].notna().sum() > 0:
                            median_val = int(round(df[feat_col].median()))
                            df[feat_col] = df[feat_col].fillna(median_val)
                            log.append(f"Imputed missing values in {feat_col} with median: {median_val}")
                    df[prefix + "_is_weekend"] = df[prefix + "_dayofweek"].apply(lambda x: x >= 5)
                    df[prefix + "_season"] = df[prefix + "_month"].apply(get_season)
                else:
                    log.append(f"Column '{col}' has no valid dates; no temporal features extracted.")
            except Exception as e:
                log.append(f"Error processing date column '{col}': {e}")

    initial_rows = df.shape[0]
    df.drop_duplicates(inplace=True)
    duplicates_removed = initial_rows - df.shape[0]
    if duplicates_removed > 0:
        log.append(f"Removed {duplicates_removed} duplicate rows.")

    constant_columns = [col for col in df.columns if df[col].nunique() <= 1 and col.lower() != "value"]
    if constant_columns:
        df.drop(columns=constant_columns, inplace=True)
        log.append(f"Dropped constant columns (excluding 'value'): {constant_columns}")

    outlier_summary = {}
    for col in numeric_cols:
        if col in df.columns and col != "temperature_2m":
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            before_rows = df.shape[0]
            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
            after_rows = df.shape[0]
            removed = before_rows - after_rows
            if removed > 0:
                outlier_summary[col] = f"Removed {removed} outliers"
                log.append(f"Removed {removed} outliers from column '{col}'.")
            else:
                outlier_summary[col] = "No outliers"
                log.append(f"No outliers found in column '{col}'.")

    if "temperature_2m" in df.columns:
        if df["temperature_2m"].nunique() > 1:
            Q1 = df["temperature_2m"].quantile(0.25)
            Q3 = df["temperature_2m"].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df["temperature_anomaly"] = ~df["temperature_2m"].between(lower_bound, upper_bound)
            n_anomalies = df["temperature_anomaly"].sum()
            outlier_summary["temperature_2m"] = f"Marked {n_anomalies} anomalies"
            log.append(f"Marked {n_anomalies} anomalies in 'temperature_2m'.")
            df = df.sort_values(by="temperature_anomaly", ascending=True).reset_index(drop=True)
        else:
            outlier_summary["temperature_2m"] = "No outliers (constant)"
            log.append("Temperature column 'temperature_2m' is constant; no anomaly marking applied.")

    for col in numeric_cols:
        if col in df.columns:
            std_col = col + "_std"
            df[std_col] = (df[col] - df[col].mean()) / df[col].std()
            log.append(f"Created standardized feature '{std_col}'.")

    log.append("\n***** Outlier Summary *****")
    for col, summary in outlier_summary.items():
        log.append(f"{col}: {summary}")

    return df, outlier_summary
def cleaner():
    log = []

    input_file = merged_csv_output
    output_file = os.path.join(output_dir, "cleaned.csv")

    # Verify input file exists
    if not os.path.exists(input_file):
        st.write(f"Input file does not exist: {input_file}")
        return
    
    # Create output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    # Load the input CSV
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        st.write(f"Error reading CSV: {e}")
        return
    
    log.append(f"Original DataFrame shape: {df.shape}")
    
    # Clean the data
    cleaned_df, outlier_summary = clean_data(df, log)
    log.append(f"Cleaned DataFrame shape: {cleaned_df.shape}")
    
    # Save the cleaned DataFrame
    try:
        cleaned_df.to_csv(output_file, index=False)
        log.append(f"Cleaned CSV saved as {output_file}")
        st.write(f"Cleaned CSV saved as {output_file}")
    except Exception as e:
        log.append(f"Error saving cleaned CSV: {e}")
        st.write(f"Error saving cleaned CSV: {e}")
    
    # Generate summary statistics and print log
    try:
        summary_stats = cleaned_df.describe(include='all')
        st.write("Summary Statistics for Cleaned Data")
        st.write(summary_stats)
        st.write("\n***** Processing Log *****")
        st.write(pd.DataFrame(log))
    except Exception as e:
        st.write(f"Error generating summary statistics: {e}")


In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import streamlit as st
from paths import *

def statistical_summary(df, output_dir):
    """
    Compute key statistical metrics for selected numerical features and save to "Statistical_Summary.txt".
    """
    key_features = ['value', 'temperature_2m', 'extracted_period_hour',
                    'extracted_period_day', 'extracted_period_month', 'extracted_period_dayofweek']
    existing_features = [col for col in key_features if col in df.columns]
    
    summary = df[existing_features].describe()
    extra_stats = pd.DataFrame({
        'skewness': df[existing_features].skew(),
        'kurtosis': df[existing_features].kurtosis()
    })
    summary = pd.concat([summary, extra_stats])
    
    st.write(summary)

def time_series_analysis(df, output_dir):
    """
    Plot electricity demand over time.
    """
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df_sorted = df.sort_values(by='date').dropna(subset=['date', 'value'])
    
    plt.figure(figsize=(12,6))
    plt.plot(df_sorted['date'], df_sorted['value'], label='Electricity Demand', color='blue')
    plt.xlabel("Time")
    plt.ylabel("Electricity Demand")
    plt.title("Electricity Demand Over Time")
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    st.pyplot(plt)

def univariate_analysis(df, output_dir):
    """
    Generate histograms, boxplots, and density plots for key numerical features.
    """
    key_features = [
        "extracted_period_hour", "extracted_period_day", "extracted_period_month", 
        "extracted_period_dayofweek", "temperature_2m"
    ]
    df = df[key_features].dropna()
    
    fig, axes = plt.subplots(len(key_features), 3, figsize=(15, 5*len(key_features)))
    
    for i, col in enumerate(key_features):
        col_data = df[col]
        
        sns.histplot(col_data, bins=30, kde=True, ax=axes[i, 0], color='skyblue')
        axes[i, 0].set_title(f"Histogram of {col}")
        
        sns.boxplot(x=col_data, ax=axes[i, 1], color='lightcoral')
        axes[i, 1].set_title(f"Boxplot of {col}")
        
        sns.kdeplot(col_data, ax=axes[i, 2], color='purple')
        axes[i, 2].set_title(f"Density Plot of {col}")
    
    plt.tight_layout()
    st.pyplot(plt)

def correlation_analysis(df):
    """
    Compute correlation matrix and visualize using a heatmap.
    """
    key_features = ['value', 'temperature_2m', 'extracted_period_hour',
                    'extracted_period_day', 'extracted_period_month', 'extracted_period_dayofweek']
    df = df[key_features].dropna()
    
    plt.figure(figsize=(8,6))
    sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", square=True)
    plt.title("Correlation Matrix")
    
    st.pyplot(plt)
    
def advanced_time_series_techniques(df):
    """
    Perform time series decomposition and ADF test.
    """
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df_sorted = df.sort_values(by='date').dropna(subset=['date', 'value'])
    ts = df_sorted.set_index('date')['value']
    
    decomposition = seasonal_decompose(ts, model='additive', period=24)
    fig = decomposition.plot()
    fig.set_size_inches(12, 8)
    plt.tight_layout()

    st.pyplot(fig)

    adf_result = adfuller(ts.dropna())
    adf_output = (f"ADF Statistic: {adf_result[0]:.4f}\n"
                    f"p-value: {adf_result[1]:.4f}\n"
                    f"Critical Values: {adf_result[4]}")
    
    st.write(adf_output)

def run_eda(input_csv):
    """
    Load dataset, run all EDA functions, and save outputs.
    """

    df = pd.read_csv(input_csv)
    st.write(f"Loaded dataset with shape: {df.shape}")
    
    statistical_summary(df, output_dir)
    time_series_analysis(df, output_dir)
    univariate_analysis(df, output_dir)
    correlation_analysis(df)
    advanced_time_series_techniques(df)



In [7]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from paths import *

def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return (data[column] < lower_bound) | (data[column] > upper_bound), lower_bound, upper_bound

def detect_outliers_zscore(data, column, threshold=3):
    return np.abs(zscore(data[column])) > threshold

def plot_before_after(df_original, df_capped, column):
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    
    # Before
    axes[0].boxplot(df_original[column].dropna(), vert=True)
    axes[0].set_title(f"Before Capping - {column}")
    
    # After
    axes[1].boxplot(df_capped[column].dropna(), vert=True)
    axes[1].set_title(f"After Capping - {column}")
    
    plt.tight_layout()
    st.pyplot(fig)
    

def handle_outliers(df, num_cols):
    outlier_summary = []
    total_outliers = 0

    for col in num_cols:
        if col in df.columns:
            # IQR-based detection
            iqr_mask, lower_bound, upper_bound = detect_outliers_iqr(df, col)
            
            # Z-score detection
            z_mask = detect_outliers_zscore(df, col)
            
            # Combine both outlier masks (logical OR)
            combined_mask = iqr_mask | z_mask
            
            # Count outliers before capping
            outlier_count = combined_mask.sum()
            total_outliers += outlier_count
            
            # Record summary
            outlier_summary.append({
                "column": col,
                "iqr_outliers": iqr_mask.sum(),
                "zscore_outliers": z_mask.sum(),
                "combined_outliers": outlier_count,
                "lower_bound": lower_bound,
                "upper_bound": upper_bound
            })
            
            # cap outliers
            df.loc[df[col] < lower_bound, col] = lower_bound
            df.loc[df[col] > upper_bound, col] = upper_bound

    return df, outlier_summary, total_outliers

def generate_report(outlier_summary, total_outliers):
    st.write("Outlier Handling Report")

    st.write("Column-by-Column Summary:")
    for summary in outlier_summary:
        st.write(
            f"- {summary['column']}:\n"
            f"   IQR outliers: {summary['iqr_outliers']}\n"
            f"   Z-score outliers: {summary['zscore_outliers']}\n"
            f"   Combined outliers: {summary['combined_outliers']}\n"
            f"   Capping range: [{summary['lower_bound']:.2f}, {summary['upper_bound']:.2f}]\n"
        )

    st.write(f"Total Outliers (across all columns): {total_outliers}\n")

    st.write("Technical Rationale:")
    st.write("- Outliers can distort mean/variance, affect model performance, and skew visualizations.")
    st.write("- By capping rather than removing, we retain data size while mitigating extreme skew but this cause data to be biased.\n")


    st.write("Decision:")
    st.write("  We applied capping based on IQR boundaries.\n")
def outlier():
    num_cols = ["value", "temperature_2m", "value_std", "temperature_2m_std"]
    
    df = pd.read_csv(input_file)
    
    df_original = df.copy()
    
    df, outlier_summary, total_outliers = handle_outliers(df, num_cols)
    
    for col in num_cols:
        if col in df.columns:
            plot_before_after(df_original, df, col)
    
    generate_report(outlier_summary, total_outliers)
    st.write("Outlier detection, capping, and reporting complete.")




In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import streamlit as st
from paths import *  

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test) 
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    return y_pred, mse, rmse, r2

def plot_actual_vs_predicted(y_test, y_pred):
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, alpha=0.5, color='blue')
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    plt.xlabel("Actual Electricity Demand")
    plt.ylabel("Predicted Electricity Demand")
    plt.title("Actual vs. Predicted Electricity Demand")
    st.pyplot(plt)

def residual_analysis(y_test, y_pred):
    residuals = y_test - y_pred
    plt.figure(figsize=(8, 6))
    sns.histplot(residuals, bins=30, kde=True)
    plt.axvline(x=0, color='red', linestyle='--')
    plt.xlabel("Residuals")
    plt.ylabel("Frequency")
    plt.title("Residual Analysis")
    st.pyplot(plt)

def plot_model_performance(mse, rmse, r2):
    metrics = [mse, rmse, r2]
    metric_names = ['Mean Squared Error', 'Root Mean Squared Error', 'R² Score']
    
    plt.figure(figsize=(8, 5))
    sns.barplot(x=metric_names, y=metrics, palette='viridis')
    plt.ylabel('Score')
    plt.title('Model Performance Metrics')
    plt.ylim(0, max(metrics) * 1.1)  
    st.pyplot(plt)

def regression():
    features = [
        "extracted_period_hour", "extracted_period_day", "extracted_period_month", 
        "extracted_period_dayofweek", "temperature_2m"
    ]
    target = "value"  
    # Load and preprocess data
    df = pd.read_csv(input_file)
    df[features] = df[features].fillna(df[features].median())
    df[target] = df[target].fillna(df[target].median())
    x,y = df[features],df[target]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # Train model
    model = LinearRegression()
    model.fit(X_train, y_train)


    # Evaluate model
    y_pred, mse, rmse, r2 = evaluate_model(model, X_test, y_test)

    # Print evaluation metrics
    st.write(f"Mean Squared Error (MSE): {mse}")
    st.write(f"Root Mean Squared Error (RMSE): {rmse}")
    st.write(f"R² Score: {r2}")
    st.write("Model R^2 Score:", model.score(X_test, y_test))

    # Plot results
    plot_actual_vs_predicted(y_test, y_pred)
    residual_analysis(y_test, y_pred)
    plot_model_performance(mse, rmse, r2)


In [None]:
from eda import run_eda
from loader import merger
from outlier import outlier
from processor import cleaner
from regression import regression
from paths import input_file
import streamlit as st

if __name__ == "__main__":
    st.title("Electricity Demand Forecasting")
    st.header("Introduction")
    st.write("This application performs data preprocessing, exploratory data analysis, outlier detection, and regression analysis on electricity demand data.")
    st.header("Data Loading")
    merger()
    st.header("Data Preprocessing")
    cleaner()
    st.header("Exploratory Data Analysis")
    run_eda(input_file)
    st.header("Outlier Detection")
    outlier()
    st.header("Regression Analysis")
    regression()
    


2025-03-02 11:48:01.216 
  command:

    streamlit run C:\Users\Taqi\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
  df = pd.read_csv(input_file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["temperature_anomaly"] = ~df["temperature_2m"].between(lower_bound, upper_bound)
2025-03-02 11:48:40.472 Serialization of dataframe to Arrow table was unsuccessful due to: ("Could not convert Timestamp('2024-01-09 19:46:08.115450880+0000', tz='UTC') with type Timestamp: tried to convert to int64", 'Conversion failed for column date with type object'). Applying automatic fixes for column types to make the dataframe Arrow-compatible.
