In [None]:
# Cell 1: imports & load cleaned CSVs
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

sns.set(style="whitegrid")

# Expected cleaned CSVs, keyed by the label that ends up in the 'Country' column.
files = {
    "Benin": "../data/benin_clean.csv",
    "SierraLeone": "../data/sierraleone_clean.csv",
    "Togo": "../data/togo_clean.csv"
}

# Load whichever files exist; a missing file is reported and that country is skipped.
dfs = {}
for country, path in files.items():
    if os.path.exists(path):
        df = pd.read_csv(path)
        dfs[country] = df
        print(f"Loaded {country} ({len(df)} rows).")
    else:
        print(f"WARNING: {path} not found. {country} will be skipped.")

# Fail early, naming the paths that were actually searched, when nothing was loaded.
if not dfs:
    raise FileNotFoundError(
        "No cleaned CSVs were found. Expected at least one of: "
        + ", ".join(files.values())
        + ". Place cleaned files and rerun."
    )

# Combine into a single DataFrame with a 'Country' column.
# `assign` returns a copy, so the per-country frames kept in `dfs` stay untouched.
combined = [frame.assign(Country=name) for name, frame in dfs.items()]
df_all = pd.concat(combined, ignore_index=True)

# Ensure numeric types for the key metrics; unparseable values become NaN.
key_metrics = ['GHI', 'DNI', 'DHI']
missing_metrics = [col for col in key_metrics if col not in df_all.columns]
if missing_metrics:
    # Surface the problem here instead of as a KeyError in a later cell.
    print(f"WARNING: expected column(s) {missing_metrics} not found in the combined data.")
for col in key_metrics:
    if col in df_all.columns:
        df_all[col] = pd.to_numeric(df_all[col], errors='coerce')

df_all.info()


In [None]:
# Cell 2: per-country mean / median / std table for GHI, DNI and DHI
metrics = ['GHI', 'DNI', 'DHI']
summary = (
    df_all
    .groupby('Country')[metrics]
    .agg(['mean', 'median', 'std'])
    .round(3)
)
# Collapse the two-level (metric, statistic) header into "<metric>_<statistic>" names
summary.columns = [f"{metric}_{stat}".strip() for metric, stat in summary.columns]
summary = summary.reset_index()
display(summary)

# Persist the table; the output folder is created on first run
os.makedirs('outputs', exist_ok=True)
summary.to_csv('outputs/country_metrics_summary.csv', index=False)
print("Saved summary to outputs/country_metrics_summary.csv")


In [None]:
# Cell 3: one boxplot per metric, with the countries side by side
# Disable matplotlib's "too many open figures" warning for this loop.
plt.rcParams['figure.max_open_warning'] = 0
for metric in metrics:
    fig, ax = plt.subplots(figsize=(8, 5))
    # NOTE(review): recent seaborn releases warn when `palette` is passed without
    # `hue` -- confirm against the installed version before changing the call.
    sns.boxplot(data=df_all, x='Country', y=metric, palette='Set2', ax=ax)
    ax.set_title(f'Boxplot of {metric} by Country')
    ax.set_ylabel(metric)
    ax.set_xlabel('')
    fig.tight_layout()
    plt.show()


In [None]:
# Cell 4: Statistical testing for every metric in `metrics`
# (one-way ANOVA and Kruskal-Wallis across countries)
result_columns = ['ANOVA_f', 'ANOVA_p', 'Kruskal_h', 'Kruskal_p']
results = {}
for metric in metrics:
    groups = []
    group_names = []
    for country, country_df in dfs.items():
        # A country file without this column is reported and left out of the test
        # instead of aborting the whole cell with a KeyError.
        if metric not in country_df.columns:
            print(f"WARNING: {country} has no {metric} column; excluded from the {metric} tests.")
            continue
        # take metric series, coerce to numeric, drop NaNs
        series = pd.to_numeric(country_df[metric], errors='coerce').dropna()
        if len(series) > 0:
            groups.append(series)
            group_names.append(country)
    if len(groups) < 2:
        print(f"Not enough groups to test for {metric}.")
        continue
    print(f"{metric}: comparing {', '.join(group_names)}")

    # One-way ANOVA (assumes approximate normality / equal variances)
    try:
        f_stat, p_anova = stats.f_oneway(*groups)
    except Exception as e:
        # Keep the notebook running, but say why the entry is NaN.
        print(f"WARNING: ANOVA failed for {metric}: {e}")
        f_stat, p_anova = (np.nan, np.nan)

    # Kruskal-Wallis (non-parametric)
    try:
        h_stat, p_kruskal = stats.kruskal(*groups)
    except Exception as e:
        print(f"WARNING: Kruskal-Wallis failed for {metric}: {e}")
        h_stat, p_kruskal = (np.nan, np.nan)

    results[metric] = {
        'ANOVA_f': f_stat,
        'ANOVA_p': p_anova,
        'Kruskal_h': h_stat,
        'Kruskal_p': p_kruskal
    }

# Display results. `reindex` fixes the column order and also yields an empty table
# with the expected columns when no metric could be tested.
res_df = pd.DataFrame.from_dict(results, orient='index')
res_df = res_df.reindex(columns=result_columns).round(6)
display(res_df)
# Save results; create the folder here so this cell does not rely on an earlier one.
os.makedirs('outputs', exist_ok=True)
res_df.to_csv('outputs/stat_test_results.csv')
print("Saved statistical test results to outputs/stat_test_results.csv")


In [None]:
# Cell 5: Bar chart ranking countries by average GHI (highest first)
avg_ghi = (
    df_all
    .groupby('Country')['GHI']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=avg_ghi, x='Country', y='GHI', palette='Blues_d', ax=ax)
ax.set_title('Average GHI by Country')
# The superscript two was mis-encoded ("mÂ²") in the original label.
ax.set_ylabel('Average GHI (W/m²)')
ax.set_xlabel('')
# Place each value label slightly above its bar; the offset is 2% of the tallest bar.
label_offset = 0.02 * avg_ghi['GHI'].max()
for i, v in enumerate(avg_ghi['GHI'].round(2)):
    ax.text(i, v + label_offset, str(v), ha='center')
fig.tight_layout()
plt.show()
