<a href="https://colab.research.google.com/github/yayesh16/solar-challenge-week/blob/eda-zambia/Zambia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!git clone https://github.com/yayesh16/solar-challenge-week.git

Cloning into 'solar-challenge-week'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 16 (delta 4), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (16/16), done.
Resolving deltas: 100% (4/4), done.


In [4]:
%cd solar-challenge-week/

/content/solar-challenge-week


In [8]:
!mkdir -p data

In [11]:
!cp "/content/drive/MyDrive/10acadamy/Solar data/zambia.csv" data/

In [12]:
!ls data/

zambia.csv


In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import os
from windrose import WindroseAxes

# --- 1. Configuration ---
COUNTRY_NAME = "Zambia"
DATA_PATH = f"data/zambia.csv"
CLEANED_DATA_PATH = f"data/zambia_clean.csv"

print(f"Starting EDA for zambia")
print(f"Attempting to load data from: data/zambia.csv")

# --- 2. Data Loading ---
try:
    df = pd.read_csv("data/zambia.csv")
    print("Data loaded successfully.")
except FileNotFoundError:
    print(f"Error: data/zambia.csv not found. Please ensure the CSV file is in the 'data/' directory.")
    exit()
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    exit()

df_original = df.copy()

# Convert 'Date (UTC)' to datetime and set as index
timestamp_column_name = 'Date (UTC)'
if timestamp_column_name in df.columns:
    df[timestamp_column_name] = pd.to_datetime(df[timestamp_column_name], errors='coerce')
    df.dropna(subset=[timestamp_column_name], inplace=True)
    df.set_index(timestamp_column_name, inplace=True)
    df.sort_index(inplace=True)
    print(f"{timestamp_column_name} column processed and set as index.")
    df.index.name = 'Timestamp'
else:
    print(f"Warning: '{timestamp_column_name}' column not found. Please check your CSV column names.")
    print("Available columns:", df.columns.tolist())

# --- RENAME COLUMNS FOR CONSISTENCY WITH SCRIPT'S EXPECTATIONS ---
# This map is based on typical IFC solar data column names.
# If your CSV's column names differ, adjust the LEFT side of the 'key: value' pairs.
column_rename_map = {
    'GHI (W/m2)': 'GHI',
    'DNI (W/m2)': 'DNI',
    'DHI (W/m2)': 'DHI',
    'Module Temp (C) (logger 1)': 'ModA',
    'Module Temp (C) (logger 2)': 'ModB',
    'Ambient Temp (C)': 'Tamb',
    'Relative Humidity (%)': 'RH',
    'Wind Speed (m/s)': 'WS',
    'Wind Speed Gust (m/s)': 'WSgust',
    'Wind Direction (deg)': 'WD',
    'Barometric Pressure (mBar)': 'BP'
}

df.rename(columns={k: v for k, v in column_rename_map.items() if k in df.columns}, inplace=True)
df_original.rename(columns={k: v for k, v in column_rename_map.items() if k in df_original.columns}, inplace=True)

print("\n--- Initial DataFrame Info ---")
df.info()
print("\nFirst 5 rows:")
print(df.head())

# --- 3. Summary Statistics & Missing-Value Report ---
print("\n--- Summary Statistics for Numeric Columns ---")
print(df.describe())

print("\n--- Missing Value Report ---")
missing_values = df.isna().sum()
missing_percentage = (df.isna().sum() / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percentage
})
print(missing_df[missing_df['Missing Count'] > 0].sort_values(by='Missing Percentage', ascending=False))

high_null_cols = missing_percentage[missing_percentage > 5].index.tolist()
if high_null_cols:
    print(f"\nColumns with >5% nulls: {high_null_cols}")
else:
    print("\nNo columns with >5% nulls.")

# --- 4. Outlier Detection & Basic Cleaning ---
print("\n--- Outlier Detection and Cleaning ---")

zscore_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
zscore_cols_existing = [col for col in zscore_cols if col in df.columns]

df['cleaning_flag'] = 'original'

for col in zscore_cols_existing:
    if df[col].dtype in ['float64', 'int64']:
        temp_series = df[col].dropna()
        if not temp_series.empty:
            z_scores = np.abs(zscore(temp_series))
            outlier_mask = pd.Series(False, index=df.index)
            outlier_mask.loc[temp_series.index] = (z_scores > 3)
            df.loc[outlier_mask, 'cleaning_flag'] = 'outlier_flagged'
            print(f"Flagged {outlier_mask.sum()} outliers in column: {col}")
        else:
            print(f"Column '{col}' is entirely NaN, skipping Z-score calculation.")
    else:
        print(f"Skipping Z-score for non-numeric column: {col}")

key_columns_for_imputation = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust', 'Tamb', 'RH', 'BP']
key_columns_for_imputation_existing = [col for col in key_columns_for_imputation if col in df.columns]

for col in key_columns_for_imputation_existing:
    if df[col].isna().sum() > 0:
        if df[col].dtype in ['float64', 'int64']:
            median_val = df[col].median()
            df[col].fillna(median_val, inplace=True)
            print(f"Imputed missing values in '{col}' with median: {median_val:.2f}")
        else:
            print(f"Skipping imputation for non-numeric column with NaNs: {col}")

print("\n--- Missing Value Report After Imputation ---")
print(df.isna().sum()[df.isna().sum() > 0])
if df.isna().sum().sum() == 0:
    print("No missing values remaining in the DataFrame.")
else:
    print("Some missing values still remain (likely in non-key columns or after timestamp processing).")

try:
    df.to_csv(CLEANED_DATA_PATH, index=True)
    print(f"\nCleaned DataFrame exported to: {CLEANED_DATA_PATH}")
    print("Remember to add 'data/' to your .gitignore file to prevent committing CSVs.")
except Exception as e:
    print(f"Error exporting cleaned data: {e}")

# --- 5. Time Series Analysis ---
print("\n--- Time Series Analysis ---")

time_series_cols = ['GHI', 'DNI', 'DHI', 'Tamb']
time_series_cols_existing = [col for col in time_series_cols if col in df.columns]

plt.figure(figsize=(18, 10))
for i, col in enumerate(time_series_cols_existing):
    plt.subplot(len(time_series_cols_existing), 1, i + 1)
    df[col].plot(title=f'{col} Over Time for zambia', grid=True)
    plt.ylabel(col)
plt.tight_layout()
plt.show()

if 'GHI' in df.columns:
    plt.figure(figsize=(12, 6))
    df['GHI'].resample('M').mean().plot(kind='bar', title=f'Average Monthly GHI for zambia', grid=True)
    plt.xlabel('Month')
    plt.ylabel('Average GHI')
    plt.show()

if 'GHI' in df.columns:
    plt.figure(figsize=(12, 6))
    df.groupby(df.index.hour)['GHI'].mean().plot(title=f'Average Hourly GHI for zambia', grid=True)
    plt.xlabel('Hour of Day')
    plt.ylabel('Average GHI')
    plt.xticks(range(0, 24))
    plt.show()

# --- 6. Cleaning Impact ---
print("\n--- Cleaning Impact Analysis ---")

if 'cleaning_flag' in df.columns and 'ModA' in df.columns and 'ModB' in df.columns:
    df_original_flagged = df_original.copy()
    if timestamp_column_name in df_original_flagged.columns:
        df_original_flagged[timestamp_column_name] = pd.to_datetime(df_original_flagged[timestamp_column_name], errors='coerce')
        df_original_flagged.dropna(subset=[timestamp_column_name], inplace=True)
        df_original_flagged.set_index(timestamp_column_name, inplace=True)
        df_original_flagged.index.name = 'Timestamp'

    df_original_flagged['cleaning_flag_pre_clean'] = 'original'
    for col in zscore_cols_existing:
        if col in df_original_flagged.columns and df_original_flagged[col].dtype in ['float64', 'int64']:
            temp_series_orig = df_original_flagged[col].dropna()
            if not temp_series_orig.empty:
                z_scores_orig = np.abs(zscore(temp_series_orig))
                outlier_mask_orig = pd.Series(False, index=df_original_flagged.index)
                outlier_mask_orig.loc[temp_series_orig.index] = (z_scores_orig > 3)
                df_original_flagged.loc[outlier_mask_orig, 'cleaning_flag_pre_clean'] = 'outlier_flagged'
            else:
                print(f"Original column '{col}' is entirely NaN, skipping outlier flagging for comparison.")

    print("\nAverage ModA & ModB (Original Data, grouped by potential outlier flag):")
    if 'ModA' in df_original_flagged.columns and 'ModB' in df_original_flagged.columns:
        print(df_original_flagged.groupby('cleaning_flag_pre_clean')[['ModA', 'ModB']].mean())
    else:
        print("ModA or ModB not found in original data for cleaning impact analysis.")

    print("\nAverage ModA & ModB (Cleaned Data, overall mean after imputation):")
    if 'ModA' in df.columns and 'ModB' in df.columns:
        print(df[['ModA', 'ModB']].mean())
    else:
        print("ModA or ModB not found in cleaned data for cleaning impact analysis.")

    plt.figure(figsize=(14, 6))

    if 'ModA' in df.columns and 'ModA' in df_original_flagged.columns:
        plt.subplot(1, 2, 1)
        sns.histplot(df_original_flagged['ModA'].dropna(), color='skyblue', label='Original ModA', kde=True)
        sns.histplot(df['ModA'], color='orange', label='Cleaned ModA', kde=True)
        plt.title(f'Distribution of ModA (Original vs. Cleaned) for {COUNTRY_NAME}')
        plt.legend()

    if 'ModB' in df.columns and 'ModB' in df_original_flagged.columns:
        plt.subplot(1, 2, 2)
        sns.histplot(df_original_flagged['ModB'].dropna(), color='lightgreen', label='Original ModB', kde=True)
        sns.histplot(df['ModB'], color='red', label='Cleaned ModB', kde=True)
        plt.title(f'Distribution of ModB (Original vs. Cleaned) for zambia')
        plt.legend()
    plt.tight_layout()
    plt.show()

else:
    print("Cannot perform cleaning impact analysis: 'cleaning_flag', 'ModA', or 'ModB' column missing.")

# --- 7. Correlation & Relationship Analysis ---
print("\n--- Correlation & Relationship Analysis ---")

correlation_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WD', 'BP']
correlation_cols_existing = [col for col in correlation_cols if col in df.columns]

if len(correlation_cols_existing) > 1:
    plt.figure(figsize=(10, 8))
    sns.heatmap(df[correlation_cols_existing].corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Correlation Heatmap for zambia')
    plt.show()
else:
    print("Not enough numeric columns for correlation heatmap.")

scatter_plots_config = [
    {'x': 'WS', 'y': 'GHI'},
    {'x': 'WSgust', 'y': 'GHI'},
    {'x': 'WD', 'y': 'GHI'},
    {'x': 'RH', 'y': 'Tamb'},
    {'x': 'RH', 'y': 'GHI'}
]

plt.figure(figsize=(18, 12))
for i, config in enumerate(scatter_plots_config):
    x_col = config['x']
    y_col = config['y']
    if x_col in df.columns and y_col in df.columns:
        plt.subplot(2, 3, i + 1)
        sns.scatterplot(data=df, x=x_col, y=y_col, alpha=0.6)
        plt.title(f'{y_col} vs. {x_col} for zambia')
        plt.xlabel(x_col)
        plt.ylabel(y_col)
    else:
        print(f"Skipping scatter plot: '{x_col}' or '{y_col}' not found.")
plt.tight_layout()
plt.show()

# --- 8. Wind & Distribution Analysis ---
print("\n--- Wind & Distribution Analysis ---")

if 'WS' in df.columns and 'WD' in df.columns:
    try:
        ax = WindroseAxes.from_ax()
        ax.bar(df['WD'], df['WS'], normed=True, opening=0.8, edgecolor='white')
        ax.set_legend()
        plt.title(f'Wind Rose Plot for zambia')
        plt.show()
        print("Wind rose plot generated.")
    except Exception as e:
        print(f"Could not generate wind rose plot (might need 'windrose' library or data issues): {e}")
else:
    print("Skipping wind rose plot: 'WS' or 'WD' column not found.")

histogram_cols = ['GHI', 'WS']
histogram_cols_existing = [col for col in histogram_cols if col in df.columns]

plt.figure(figsize=(12, 5))
for i, col in enumerate(histogram_cols_existing):
    plt.subplot(1, len(histogram_cols_existing), i + 1)
    sns.histplot(df[col].dropna(), kde=True)
    plt.title(f'Distribution of {col} for zambia')
    plt.xlabel(col)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# --- 9. Temperature Analysis ---
print("\n--- Temperature Analysis ---")

if 'RH' in df.columns and 'Tamb' in df.columns and 'GHI' in df.columns:
    print(f"Correlation between RH and Tamb: {df['RH'].corr(df['Tamb']):.2f}")
    print(f"Correlation between RH and GHI: {df['RH'].corr(df['GHI']):.2f}")

    plt.figure(figsize=(10, 7))
    sns.scatterplot(data=df, x='Tamb', y='GHI', hue='RH', palette='viridis', alpha=0.7, size='RH', sizes=(20, 400))
    plt.title(f'GHI vs. Ambient Temperature colored by Relative Humidity for zambia')
    plt.xlabel('Ambient Temperature (Tamb)')
    plt.ylabel('Global Horizontal Irradiance (GHI)')
    plt.legend(title='Relative Humidity')
    plt.show()
else:
    print("Skipping detailed temperature analysis: 'RH', 'Tamb', or 'GHI' not found.")

# --- 10. Bubble Chart ---
print("\n--- Bubble Chart Analysis ---")

if 'GHI' in df.columns and 'Tamb' in df.columns and 'RH' in df.columns:
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x='Tamb', y='GHI', size='RH', hue='RH', palette='viridis', sizes=(50, 1000), alpha=0.7)
    plt.title(f'GHI vs. Ambient Temperature with Bubble Size by Relative Humidity for zambia')
    plt.xlabel('Ambient Temperature (Tamb)')
    plt.ylabel('Global Horizontal Irradiance (GHI)')
    plt.legend(title='Relative Humidity', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.show()
elif 'GHI' in df.columns and 'Tamb' in df.columns and 'BP' in df.columns:
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x='Tamb', y='GHI', size='BP', hue='BP', palette='cividis', sizes=(50, 1000), alpha=0.7)
    plt.title(f'GHI vs. Ambient Temperature with Bubble Size by Barometric Pressure for zambia')
    plt.xlabel('Ambient Temperature (Tamb)')
    plt.ylabel('Global Horizontal Irradiance (GHI)')
    plt.legend(title='Barometric Pressure', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.show()
else:
    print("Cannot generate bubble chart: Missing GHI, Tamb, and either RH or BP columns.")

print(f"\n--- EDA for zambia Completed ---")

ModuleNotFoundError: No module named 'windrose'

In [14]:
!pip install windrose

Collecting windrose
  Downloading windrose-1.9.2-py3-none-any.whl.metadata (5.2 kB)
Downloading windrose-1.9.2-py3-none-any.whl (20 kB)
Installing collected packages: windrose
Successfully installed windrose-1.9.2


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import os
from windrose import WindroseAxes

# --- 1. Configuration ---
COUNTRY_NAME = "Zambia"
DATA_PATH = f"data/zambia.csv"
CLEANED_DATA_PATH = f"data/zambia_clean.csv"

print(f"Starting EDA for zambia")
print(f"Attempting to load data from: data/zambia.csv")

# --- 2. Data Loading ---
try:
    df = pd.read_csv("data/zambia.csv")
    print("Data loaded successfully.")
except FileNotFoundError:
    print(f"Error: data/zambia.csv not found. Please ensure the CSV file is in the 'data/' directory.")
    exit()
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    exit()

df_original = df.copy()

# Convert 'Date (UTC)' to datetime and set as index
timestamp_column_name = 'Date (UTC)'
if timestamp_column_name in df.columns:
    df[timestamp_column_name] = pd.to_datetime(df[timestamp_column_name], errors='coerce')
    df.dropna(subset=[timestamp_column_name], inplace=True)
    df.set_index(timestamp_column_name, inplace=True)
    df.sort_index(inplace=True)
    print(f"{timestamp_column_name} column processed and set as index.")
    df.index.name = 'Timestamp'
else:
    print(f"Warning: '{timestamp_column_name}' column not found. Please check your CSV column names.")
    print("Available columns:", df.columns.tolist())

# --- RENAME COLUMNS FOR CONSISTENCY WITH SCRIPT'S EXPECTATIONS ---
# This map is based on typical IFC solar data column names.
# If your CSV's column names differ, adjust the LEFT side of the 'key: value' pairs.
column_rename_map = {
    'GHI (W/m2)': 'GHI',
    'DNI (W/m2)': 'DNI',
    'DHI (W/m2)': 'DHI',
    'Module Temp (C) (logger 1)': 'ModA',
    'Module Temp (C) (logger 2)': 'ModB',
    'Ambient Temp (C)': 'Tamb',
    'Relative Humidity (%)': 'RH',
    'Wind Speed (m/s)': 'WS',
    'Wind Speed Gust (m/s)': 'WSgust',
    'Wind Direction (deg)': 'WD',
    'Barometric Pressure (mBar)': 'BP'
}

df.rename(columns={k: v for k, v in column_rename_map.items() if k in df.columns}, inplace=True)
df_original.rename(columns={k: v for k, v in column_rename_map.items() if k in df_original.columns}, inplace=True)

print("\n--- Initial DataFrame Info ---")
df.info()
print("\nFirst 5 rows:")
print(df.head())

# --- 3. Summary Statistics & Missing-Value Report ---
print("\n--- Summary Statistics for Numeric Columns ---")
print(df.describe())

print("\n--- Missing Value Report ---")
missing_values = df.isna().sum()
missing_percentage = (df.isna().sum() / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percentage
})
print(missing_df[missing_df['Missing Count'] > 0].sort_values(by='Missing Percentage', ascending=False))

high_null_cols = missing_percentage[missing_percentage > 5].index.tolist()
if high_null_cols:
    print(f"\nColumns with >5% nulls: {high_null_cols}")
else:
    print("\nNo columns with >5% nulls.")

# --- 4. Outlier Detection & Basic Cleaning ---
print("\n--- Outlier Detection and Cleaning ---")

zscore_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
zscore_cols_existing = [col for col in zscore_cols if col in df.columns]

df['cleaning_flag'] = 'original'

for col in zscore_cols_existing:
    if df[col].dtype in ['float64', 'int64']:
        temp_series = df[col].dropna()
        if not temp_series.empty:
            z_scores = np.abs(zscore(temp_series))
            outlier_mask = pd.Series(False, index=df.index)
            outlier_mask.loc[temp_series.index] = (z_scores > 3)
            df.loc[outlier_mask, 'cleaning_flag'] = 'outlier_flagged'
            print(f"Flagged {outlier_mask.sum()} outliers in column: {col}")
        else:
            print(f"Column '{col}' is entirely NaN, skipping Z-score calculation.")
    else:
        print(f"Skipping Z-score for non-numeric column: {col}")


key_columns_for_imputation = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust', 'Tamb', 'RH', 'BP']
key_columns_for_imputation_existing = [col for col in key_columns_for_imputation if col in df.columns]

for col in key_columns_for_imputation_existing:
    if df[col].isna().sum() > 0:
        if df[col].dtype in ['float64', 'int64']:
            median_val = df[col].median()
            df[col].fillna(median_val, inplace=True)
            print(f"Imputed missing values in '{col}' with median: {median_val:.2f}")
        else:
            print(f"Skipping imputation for non-numeric column with NaNs: {col}")

print("\n--- Missing Value Report After Imputation ---")
print(df.isna().sum()[df.isna().sum() > 0])
if df.isna().sum().sum() == 0:
    print("No missing values remaining in the DataFrame.")
else:
    print("Some missing values still remain (likely in non-key columns or after timestamp processing).")

try:
    df.to_csv(CLEANED_DATA_PATH, index=True)
    print(f"\nCleaned DataFrame exported to: {CLEANED_DATA_PATH}")
    print("Remember to add 'data/' to your .gitignore file to prevent committing CSVs.")
except Exception as e:
    print(f"Error exporting cleaned data: {e}")

# --- 5. Time Series Analysis ---
print("\n--- Time Series Analysis ---")

time_series_cols = ['GHI', 'DNI', 'DHI', 'Tamb']
time_series_cols_existing = [col for col in time_series_cols if col in df.columns]

plt.figure(figsize=(18, 10))
for i, col in enumerate(time_series_cols_existing):
    plt.subplot(len(time_series_cols_existing), 1, i + 1)
    df[col].plot(title=f'{col} Over Time for zambia', grid=True)
    plt.ylabel(col)
plt.tight_layout()
plt.show()

if 'GHI' in df.columns:
    plt.figure(figsize=(12, 6))
    df['GHI'].resample('M').mean().plot(kind='bar', title=f'Average Monthly GHI for zambia', grid=True)
    plt.xlabel('Month')
    plt.ylabel('Average GHI')
    plt.show()

if 'GHI' in df.columns:
    plt.figure(figsize=(12, 6))
    df.groupby(df.index.hour)['GHI'].mean().plot(title=f'Average Hourly GHI for zambia', grid=True)
    plt.xlabel('Hour of Day')
    plt.ylabel('Average GHI')
    plt.xticks(range(0, 24))
    plt.show()

# --- 6. Cleaning Impact ---
print("\n--- Cleaning Impact Analysis ---")

if 'cleaning_flag' in df.columns and 'ModA' in df.columns and 'ModB' in df.columns:
    df_original_flagged = df_original.copy()
    if timestamp_column_name in df_original_flagged.columns:
        df_original_flagged[timestamp_column_name] = pd.to_datetime(df_original_flagged[timestamp_column_name], errors='coerce')
        df_original_flagged.dropna(subset=[timestamp_column_name], inplace=True)
        df_original_flagged.set_index(timestamp_column_name, inplace=True)
        df_original_flagged.index.name = 'Timestamp'

    df_original_flagged['cleaning_flag_pre_clean'] = 'original'
    for col in zscore_cols_existing:
        if col in df_original_flagged.columns and df_original_flagged[col].dtype in ['float64', 'int64']:
            temp_series_orig = df_original_flagged[col].dropna()
            if not temp_series_orig.empty:
                z_scores_orig = np.abs(zscore(temp_series_orig))
                outlier_mask_orig = pd.Series(False, index=df_original_flagged.index)
                outlier_mask_orig.loc[temp_series_orig.index] = (z_scores_orig > 3)
                df_original_flagged.loc[outlier_mask_orig, 'cleaning_flag_pre_clean'] = 'outlier_flagged'
            else:
                print(f"Original column '{col}' is entirely NaN, skipping outlier flagging for comparison.")

    print("\nAverage ModA & ModB (Original Data, grouped by potential outlier flag):")
    if 'ModA' in df_original_flagged.columns and 'ModB' in df_original_flagged.columns:
        print(df_original_flagged.groupby('cleaning_flag_pre_clean')[['ModA', 'ModB']].mean())
    else:
        print("ModA or ModB not found in original data for cleaning impact analysis.")


    print("\nAverage ModA & ModB (Cleaned Data, overall mean after imputation):")
    if 'ModA' in df.columns and 'ModB' in df.columns:
        print(df[['ModA', 'ModB']].mean())
    else:
        print("ModA or ModB not found in cleaned data for cleaning impact analysis.")

    plt.figure(figsize=(14, 6))

    if 'ModA' in df.columns and 'ModA' in df_original_flagged.columns:
        plt.subplot(1, 2, 1)
        sns.histplot(df_original_flagged['ModA'].dropna(), color='skyblue', label='Original ModA', kde=True)
        sns.histplot(df['ModA'], color='orange', label='Cleaned ModA', kde=True)
        plt.title(f'Distribution of ModA (Original vs. Cleaned) for {COUNTRY_NAME}')
        plt.legend()

    if 'ModB' in df.columns and 'ModB' in df_original_flagged.columns:
        plt.subplot(1, 2, 2)
        sns.histplot(df_original_flagged['ModB'].dropna(), color='lightgreen', label='Original ModB', kde=True)
        sns.histplot(df['ModB'], color='red', label='Cleaned ModB', kde=True)
        plt.title(f'Distribution of ModB (Original vs. Cleaned) for zambia')
        plt.legend()
    plt.tight_layout()
    plt.show()

else:
    print("Cannot perform cleaning impact analysis: 'cleaning_flag', 'ModA', or 'ModB' column missing.")

# --- 7. Correlation & Relationship Analysis ---
print("\n--- Correlation & Relationship Analysis ---")

correlation_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WD', 'BP']
correlation_cols_existing = [col for col in correlation_cols if col in df.columns]

if len(correlation_cols_existing) > 1:
    plt.figure(figsize=(10, 8))
    sns.heatmap(df[correlation_cols_existing].corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Correlation Heatmap for zambia')
    plt.show()
else:
    print("Not enough numeric columns for correlation heatmap.")

scatter_plots_config = [
    {'x': 'WS', 'y': 'GHI'},
    {'x': 'WSgust', 'y': 'GHI'},
    {'x': 'WD', 'y': 'GHI'},
    {'x': 'RH', 'y': 'Tamb'},
    {'x': 'RH', 'y': 'GHI'}
]

plt.figure(figsize=(18, 12))
for i, config in enumerate(scatter_plots_config):
    x_col = config['x']
    y_col = config['y']
    if x_col in df.columns and y_col in df.columns:
        plt.subplot(2, 3, i + 1)
        sns.scatterplot(data=df, x=x_col, y=y_col, alpha=0.6)
        plt.title(f'{y_col} vs. {x_col} for zambia')
        plt.xlabel(x_col)
        plt.ylabel(y_col)
    else:
        print(f"Skipping scatter plot: '{x_col}' or '{y_col}' not found.")
plt.tight_layout()
plt.show()

# --- 8. Wind & Distribution Analysis ---
print("\n--- Wind & Distribution Analysis ---")

if 'WS' in df.columns and 'WD' in df.columns:
    try:
        ax = WindroseAxes.from_ax()
        ax.bar(df['WD'], df['WS'], normed=True, opening=0.8, edgecolor='white')
        ax.set_legend()
        plt.title(f'Wind Rose Plot for zambia')
        plt.show()
        print("Wind rose plot generated.")
    except Exception as e:
        print(f"Could not generate wind rose plot (might need 'windrose' library or data issues): {e}")
else:
    print("Skipping wind rose plot: 'WS' or 'WD' column not found.")

histogram_cols = ['GHI', 'WS']
histogram_cols_existing = [col for col in histogram_cols if col in df.columns]

plt.figure(figsize=(12, 5))
for i, col in enumerate(histogram_cols_existing):
    plt.subplot(1, len(histogram_cols_existing), i + 1)
    sns.histplot(df[col].dropna(), kde=True)
    plt.title(f'Distribution of {col} for zambia')
    plt.xlabel(col)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# --- 9. Temperature Analysis ---
print("\n--- Temperature Analysis ---")

if 'RH' in df.columns and 'Tamb' in df.columns and 'GHI' in df.columns:
    print(f"Correlation between RH and Tamb: {df['RH'].corr(df['Tamb']):.2f}")
    print(f"Correlation between RH and GHI: {df['RH'].corr(df['GHI']):.2f}")


    plt.figure(figsize=(10, 7))
    sns.scatterplot(data=df, x='Tamb', y='GHI', hue='RH', palette='viridis', alpha=0.7, size='RH', sizes=(20, 400))
    plt.title(f'GHI vs. Ambient Temperature colored by Relative Humidity for zambia')
    plt.xlabel('Ambient Temperature (Tamb)')
    plt.ylabel('Global Horizontal Irradiance (GHI)')
    plt.legend(title='Relative Humidity')
    plt.show()
else:
    print("Skipping detailed temperature analysis: 'RH', 'Tamb', or 'GHI' not found.")

# --- 10. Bubble Chart ---
print("\n--- Bubble Chart Analysis ---")

if 'GHI' in df.columns and 'Tamb' in df.columns and 'RH' in df.columns:
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x='Tamb', y='GHI', size='RH', hue='RH', palette='viridis', sizes=(50, 1000), alpha=0.7)
    plt.title(f'GHI vs. Ambient Temperature with Bubble Size by Relative Humidity for zambia')
    plt.xlabel('Ambient Temperature (Tamb)')
    plt.ylabel('Global Horizontal Irradiance (GHI)')
    plt.legend(title='Relative Humidity', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.show()
elif 'GHI' in df.columns and 'Tamb' in df.columns and 'BP' in df.columns:
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x='Tamb', y='GHI', size='BP', hue='BP', palette='cividis', sizes=(50, 1000), alpha=0.7)
    plt.title(f'GHI vs. Ambient Temperature with Bubble Size by Barometric Pressure for zambia')
    plt.xlabel('Ambient Temperature (Tamb)')
    plt.ylabel('Global Horizontal Irradiance (GHI)')
    plt.legend(title='Barometric Pressure', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.show()
else:
    print("Cannot generate bubble chart: Missing GHI, Tamb, and either RH or BP columns.")

print(f"\n--- EDA for zambia Completed ---")

Starting EDA for zambia
Attempting to load data from: data/zambia.csv


  df = pd.read_csv("data/zambia.csv")


Data loaded successfully.
Available columns: ['time', 'dhi_pyr', 'ghi_pyr_1', 'ghi_pyr_2', 'air_temperature', 'relative_humidity', 'barometric_pressure', 'precipitation', 'wind_speed', 'wind_from_direction', 'gti_clean', 'gti_soil', 'sensor_cleaning', 'comments']

--- Initial DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550080 entries, 0 to 550079
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   time                 550080 non-null  object 
 1   dhi_pyr              548875 non-null  float64
 2   ghi_pyr_1            550054 non-null  float64
 3   ghi_pyr_2            550054 non-null  float64
 4   air_temperature      550054 non-null  float64
 5   relative_humidity    550054 non-null  float64
 6   barometric_pressure  550054 non-null  float64
 7   precipitation        550054 non-null  float64
 8   wind_speed           550054 non-null  float64
 9   wind_from_direction  550054

<Figure size 1800x1000 with 0 Axes>


--- Cleaning Impact Analysis ---
Cannot perform cleaning impact analysis: 'cleaning_flag', 'ModA', or 'ModB' column missing.

--- Correlation & Relationship Analysis ---
Not enough numeric columns for correlation heatmap.
Skipping scatter plot: 'WS' or 'GHI' not found.
Skipping scatter plot: 'WSgust' or 'GHI' not found.
Skipping scatter plot: 'WD' or 'GHI' not found.
Skipping scatter plot: 'RH' or 'Tamb' not found.
Skipping scatter plot: 'RH' or 'GHI' not found.


<Figure size 1800x1200 with 0 Axes>


--- Wind & Distribution Analysis ---
Skipping wind rose plot: 'WS' or 'WD' column not found.


<Figure size 1200x500 with 0 Axes>


--- Temperature Analysis ---
Skipping detailed temperature analysis: 'RH', 'Tamb', or 'GHI' not found.

--- Bubble Chart Analysis ---
Cannot generate bubble chart: Missing GHI, Tamb, and either RH or BP columns.

--- EDA for zambia Completed ---
