# Solar Data EDA: Benin Analysis
**Author**: Teferi amo  
**Date**: May 18, 2025  
**Country**: Benin

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path

# Plot appearance: matplotlib's bundled seaborn style sheet, then the "husl"
# colour palette (order matters: the palette is applied on top of the style).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Show floats with two decimal places in pandas output.
pd.options.display.float_format = '{:.2f}'.format

## 1. Data Loading & Initial Inspection

In [None]:
# Load the measurements into `df`, parsing the 'Timestamp' column and using it
# as the index.
#
# The path previously pointed at 'data/Togo_solar.csv', which contradicts this
# notebook's title (Benin) and the 'benin_clean.csv' file written later on.
# NOTE(review): the Benin filename below is assumed by analogy with the Togo
# one — confirm it matches the file on disk.
data_path = Path('data/Benin_solar.csv')  # Update path as needed
try:
    # Only the read itself is guarded, so unrelated errors are not mislabelled.
    df = pd.read_csv(data_path, parse_dates=['Timestamp'], index_col='Timestamp')
except FileNotFoundError:
    # Print the resolved location before re-raising so the path is easy to fix.
    print(f"Error: File not found at {data_path.absolute()}")
    raise
else:
    print(f"Dataset shape: {df.shape}")
    display(df.head(3))

## 2. Summary Statistics & Missing-Value Report

In [None]:
# Descriptive statistics, extended with the 1st and 99th percentiles.
summary_table = df.describe(percentiles=[.01, .25, .5, .75, .99])
display(summary_table)

# Missing-value report: absolute count and percentage per column.
na_mask = df.isna()
missing_report = pd.DataFrame({
    'Missing Values': na_mask.sum(),
    'Missing %': (na_mask.mean() * 100).round(2),
})

# Only columns with more than 5% missing values are shown.
above_five_pct = missing_report['Missing %'] > 5
display(missing_report[above_five_pct])

## 3. Outlier Detection & Cleaning

In [None]:
# Define key columns
key_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Absolute Z-score of every key column (NaNs are ignored when computing the
# mean/std and stay NaN in the result).
#
# The frame is built on df's own index. Starting from an empty DataFrame gives
# a 0..n-1 index whenever zscore returns a plain array, and the flag derived
# from it then fails to align with df's Timestamp index on assignment
# (every row ends up NaN instead of True/False).
z_scores = pd.DataFrame(
    {
        col: np.abs(np.asarray(stats.zscore(df[col], nan_policy='omit')))
        for col in key_columns
    },
    index=df.index,
)

# Flag rows where any key column lies more than 3 standard deviations from its
# mean. NaN scores compare as False, so missing readings are never flagged.
# The result is assigned positionally (.to_numpy()) so that repeated
# timestamps in the index cannot break label alignment.
df['outlier_flag'] = (z_scores > 3).any(axis=1).to_numpy()

# One box plot per key column, laid out on a 2x4 grid of subplots.
plt.figure(figsize=(12, 6))
for slot, column in enumerate(key_columns, start=1):
    panel = plt.subplot(2, 4, slot)
    sns.boxplot(y=df[column], ax=panel)
    panel.set_title(column)
plt.tight_layout()
plt.show()

# Work on a copy of df and replace missing readings in each key column with
# that column's own median; all other columns are left untouched.
df_clean = df.copy()
column_medians = {name: df_clean[name].median() for name in key_columns}
df_clean = df_clean.fillna(value=column_medians)

# Save cleaned data
output_path = Path('data/processed/benin_clean.csv')
# parents=True also creates 'data/' when it is absent, so the save cannot fail
# on a missing intermediate directory; exist_ok=True makes re-runs harmless.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path)
print(f"Saved cleaned data to {output_path}")

## 4. Time Series Analysis

In [None]:
# Ensure datetime index (resample() and .index.month below both require one)
if not isinstance(df_clean.index, pd.DatetimeIndex):
    df_clean.index = pd.to_datetime(df_clean.index)

# Variables shown in both the daily and the monthly plots
variables = ['GHI', 'DNI', 'DHI', 'Tamb']

# Resample to daily averages.
# numeric_only=True: averaging the whole frame raises a TypeError in
# pandas >= 2.0 if any column is non-numeric. Whether the CSV carries such a
# column is not visible here, so they are excluded explicitly.
daily = df_clean.resample('D').mean(numeric_only=True)

# Plot time series: one stacked panel per variable (row count follows the
# list, so adding or removing a variable needs no other change)
fig, axes = plt.subplots(len(variables), 1, figsize=(15, 12))
for ax, var in zip(axes, variables):
    daily[var].plot(ax=ax, title=f'Daily {var}')
    ax.set_ylabel(var)
plt.tight_layout()
plt.show()

# Monthly patterns: mean per calendar month (1-12), pooled across all years
# present in the data
monthly = df_clean.groupby(df_clean.index.month).mean(numeric_only=True)
monthly[variables].plot(subplots=True, figsize=(12, 10))
plt.suptitle('Monthly Patterns')
plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Annotated heatmap of the pairwise correlations between the selected columns,
# with the diverging colour map centred on zero.
corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB', 'Tamb', 'RH']
corr_matrix = df_clean[corr_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

# Scatter plots on a 2x2 grid; each entry is (x column, y column, panel title).
scatter_panels = [
    ('WS', 'GHI', 'Wind Speed vs GHI'),
    ('RH', 'Tamb', 'RH vs Ambient Temp'),
    ('WD', 'GHI', 'Wind Direction vs GHI'),
    ('RH', 'GHI', 'RH vs GHI'),
]

plt.figure(figsize=(15, 10))
for slot, (x_col, y_col, panel_title) in enumerate(scatter_panels, start=1):
    plt.subplot(2, 2, slot)
    sns.scatterplot(x=x_col, y=y_col, data=df_clean, alpha=0.5)
    plt.title(panel_title)
plt.tight_layout()
plt.show()

## 6. Wind & Distribution Analysis

In [None]:
# Histogram of the 'WD' column split into 36 bins, drawn through the
# object-oriented Axes API.
wd_fig, wd_ax = plt.subplots(figsize=(10, 6))
wd_ax.hist(df_clean['WD'], bins=36, edgecolor='black')
wd_ax.set_xlabel('Wind Direction (degrees)')
wd_ax.set_ylabel('Frequency')
wd_ax.set_title('Wind Direction Distribution')
plt.show()

# Side-by-side 30-bin histograms with a KDE overlay; each entry is
# (column, panel title).
hist_panels = [
    ('GHI', 'GHI Distribution'),
    ('WS', 'Wind Speed Distribution'),
]

plt.figure(figsize=(12, 5))
for slot, (column, panel_title) in enumerate(hist_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.histplot(df_clean[column], kde=True, bins=30)
    plt.title(panel_title)
plt.tight_layout()
plt.show()

## 7. Key Findings & Recommendations

In [None]:
# Summary statistics
display(df_clean.describe())

# Calculate insights
ghi_mean = df_clean['GHI'].mean()
outlier_pct = df_clean['outlier_flag'].mean() * 100
sensor_diff = (df_clean['ModA'] - df_clean['ModB']).abs().mean()

# Describe the RH-GHI relationship from the measured coefficient instead of
# always calling it "Strong". The 0.7 / 0.3 cut-offs are rule-of-thumb
# thresholds for strong / moderate correlation, not dataset-specific values.
rh_ghi_corr = df_clean['GHI'].corr(df_clean['RH'])
if pd.isna(rh_ghi_corr):
    # Happens e.g. when one of the columns is constant or entirely missing.
    rh_ghi_summary = 'Correlation between RH and GHI could not be computed'
else:
    if abs(rh_ghi_corr) >= 0.7:
        corr_strength = 'Strong'
    elif abs(rh_ghi_corr) >= 0.3:
        corr_strength = 'Moderate'
    else:
        corr_strength = 'Weak'
    corr_sign = 'negative' if rh_ghi_corr < 0 else 'positive'
    rh_ghi_summary = (
        f"{corr_strength} {corr_sign} correlation between RH and GHI "
        f"(r = {rh_ghi_corr:.2f})"
    )

# Unit of the ModA/ModB difference: this notebook groups ModA/ModB with the
# irradiance columns and keeps temperatures in separate TModA/TModB columns,
# so the difference is reported in W/m² rather than °C.
# NOTE(review): confirm the unit against the dataset's data dictionary.
print(f"""
### Key Findings:

1. **Solar Potential**: 
   - Average GHI: {ghi_mean:.1f} W/m²
   - Peak GHI occurs in month {monthly['GHI'].idxmax()}

2. **Data Quality**:
   - {outlier_pct:.1f}% of readings flagged as outliers
   - Sensor consistency: ModA and ModB differ by {sensor_diff:.2f} W/m² on average

3. **Weather Impact**:
   - {rh_ghi_summary}

### Recommendations:

1. **Maintenance Focus**:
   - Investigate sensor drift between ModA and ModB
   - Check wind sensors during low-wind periods

2. **Performance Optimization**:
   - Schedule cleaning during low-radiation periods
   - Adjust tilt angles based on predominant wind direction
""")